Include firstSeed and seedCount in crawl endpoints (#618)

Tessa Walsh 2023-02-22 10:27:31 -05:00 committed by GitHub
parent c309b809da
commit ed94dde7e6
2 changed files with 53 additions and 5 deletions
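
The new fields surface in the single-crawl endpoint (GET /orgs/{oid}/crawls/{crawl_id}) and the crawl list endpoints (GET /orgs/{oid}/crawls, GET /orgs/all/crawls), as exercised by the tests added below. A minimal client-side sketch of reading them, with placeholder values for the API prefix, org id, crawl id, and auth token (the real tests take these from fixtures):

import requests

# Placeholder values for illustration only
API_PREFIX = "https://btrix.example.com/api"
ORG_ID = "<org-uuid>"
HEADERS = {"Authorization": "Bearer <access-token>"}

# Single crawl: firstSeed and seedCount are top-level fields on the response
crawl = requests.get(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawls/<crawl-id>", headers=HEADERS
).json()
print(crawl["firstSeed"], crawl["seedCount"])

# Crawl list: every entry under "crawls" carries the same two fields
resp = requests.get(f"{API_PREFIX}/orgs/{ORG_ID}/crawls", headers=HEADERS).json()
for c in resp["crawls"]:
    print(c["firstSeed"], c["seedCount"])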


@@ -10,11 +10,11 @@ from typing import Optional, List, Dict, Union
 from datetime import datetime, timedelta
 
 from fastapi import Depends, HTTPException
-from pydantic import BaseModel, UUID4, conint
+from pydantic import BaseModel, UUID4, conint, HttpUrl
 from redis import asyncio as aioredis, exceptions
 import pymongo
 
+from .crawlconfigs import Seed
 from .db import BaseMongoModel
 from .users import User
 from .orgs import Organization, MAX_CRAWL_SCALE
@@ -91,11 +91,13 @@ class Crawl(BaseMongoModel):
 # ============================================================================
 class CrawlOut(Crawl):
-    """Output for single crawl, add configName and userName"""
+    """Output for single crawl, with additional fields"""
 
     userName: Optional[str]
     configName: Optional[str]
     resources: Optional[List[CrawlFileOut]] = []
+    firstSeed: Optional[str]
+    seedCount: Optional[int] = 0
 
 # ============================================================================
@@ -128,6 +130,9 @@ class ListCrawlOut(BaseMongoModel):
     notes: Optional[str]
+    firstSeed: Optional[str]
+    seedCount: Optional[int] = 0
 
 # ============================================================================
 class ListCrawls(BaseModel):
@@ -252,6 +257,8 @@
         results = await cursor.to_list(length=1000)
         crawls = [crawl_cls.from_dict(res) for res in results]
+        crawls = [await self._resolve_crawl_refs(crawl, org) for crawl in crawls]
         return crawls
 
     async def get_crawl_raw(self, crawlid: str, org: Organization):
@@ -285,7 +292,7 @@
         return await self._resolve_crawl_refs(crawl, org)
 
     async def _resolve_crawl_refs(
-        self, crawl: Union[CrawlOut, ListCrawlOut], org: Organization
+        self, crawl: Union[CrawlOut, ListCrawlOut], org: Optional[Organization]
     ):
         """Resolve running crawl data"""
         config = await self.crawl_configs.get_crawl_config(
@@ -293,7 +300,16 @@
         )
         if config:
-            crawl.configName = config.name
+            if not crawl.configName:
+                crawl.configName = config.name
+
+            if config.config.seeds:
+                first_seed = config.config.seeds[0]
+                if isinstance(first_seed, HttpUrl):
+                    crawl.firstSeed = first_seed
+                elif isinstance(first_seed, Seed):
+                    crawl.firstSeed = first_seed.url
+                crawl.seedCount = len(config.config.seeds)
 
         user = await self.user_manager.get(crawl.userid)
         if user:
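
The isinstance checks added to _resolve_crawl_refs above handle the two shapes a configured seed can take: a bare URL (a pydantic HttpUrl, which in pydantic v1 is a str subclass and so can be assigned directly to the Optional[str] firstSeed field) or a Seed object with a url attribute. A self-contained sketch of the same extraction logic, using a simplified stand-in for the Seed model imported from .crawlconfigs (the real model has more fields):

from typing import List, Optional, Tuple, Union

from pydantic import BaseModel, HttpUrl


class Seed(BaseModel):
    # Simplified stand-in for the real Seed model in .crawlconfigs
    url: HttpUrl


def seed_summary(seeds: List[Union[HttpUrl, Seed]]) -> Tuple[Optional[str], int]:
    # Mirrors the new logic: first seed URL plus total seed count
    if not seeds:
        return None, 0
    first = seeds[0]
    first_seed = first.url if isinstance(first, Seed) else first
    return str(first_seed), len(seeds)


print(seed_summary([Seed(url="https://webrecorder.net/")]))  # ('https://webrecorder.net/', 1)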


@@ -75,6 +75,38 @@ def test_crawl_info(admin_auth_headers, default_org_id, admin_crawl_id):
     assert data["fileSize"] == wacz_size
 
 
+def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_crawl_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    assert data["firstSeed"] == "https://webrecorder.net/"
+    assert data["seedCount"] == 1
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    crawls = data["crawls"]
+    assert crawls
+    for crawl in crawls:
+        assert crawl["firstSeed"]
+        assert crawl["seedCount"] > 0
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/all/crawls",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    crawls = data["crawls"]
+    assert crawls
+    for crawl in crawls:
+        assert crawl["firstSeed"]
+        assert crawl["seedCount"] > 0
+
+
 def test_download_wacz():
     r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200