diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 94026128..91283c99 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -10,11 +10,11 @@ from typing import Optional, List, Dict, Union
 from datetime import datetime, timedelta
 
 from fastapi import Depends, HTTPException
-from pydantic import BaseModel, UUID4, conint
+from pydantic import BaseModel, UUID4, conint, HttpUrl
 from redis import asyncio as aioredis, exceptions
 import pymongo
 
-
+from .crawlconfigs import Seed
 from .db import BaseMongoModel
 from .users import User
 from .orgs import Organization, MAX_CRAWL_SCALE
@@ -91,11 +91,13 @@ class Crawl(BaseMongoModel):
 
 # ============================================================================
 class CrawlOut(Crawl):
-    """Output for single crawl, add configName and userName"""
+    """Output for single crawl, with additional fields"""
 
     userName: Optional[str]
     configName: Optional[str]
     resources: Optional[List[CrawlFileOut]] = []
+    firstSeed: Optional[str]
+    seedCount: Optional[int] = 0
 
 
 # ============================================================================
@@ -128,6 +130,9 @@ class ListCrawlOut(BaseMongoModel):
 
     notes: Optional[str]
 
+    firstSeed: Optional[str]
+    seedCount: Optional[int] = 0
+
 
 # ============================================================================
 class ListCrawls(BaseModel):
@@ -252,6 +257,8 @@ class CrawlOps:
         results = await cursor.to_list(length=1000)
         crawls = [crawl_cls.from_dict(res) for res in results]
 
+        crawls = [await self._resolve_crawl_refs(crawl, org) for crawl in crawls]
+
         return crawls
 
     async def get_crawl_raw(self, crawlid: str, org: Organization):
@@ -285,7 +292,7 @@ class CrawlOps:
         return await self._resolve_crawl_refs(crawl, org)
 
     async def _resolve_crawl_refs(
-        self, crawl: Union[CrawlOut, ListCrawlOut], org: Organization
+        self, crawl: Union[CrawlOut, ListCrawlOut], org: Optional[Organization]
     ):
         """Resolve running crawl data"""
         config = await self.crawl_configs.get_crawl_config(
@@ -293,7 +300,16 @@ class CrawlOps:
         )
 
         if config:
-            crawl.configName = config.name
+            if not crawl.configName:
+                crawl.configName = config.name
+
+            if config.config.seeds:
+                first_seed = config.config.seeds[0]
+                if isinstance(first_seed, HttpUrl):
+                    crawl.firstSeed = first_seed
+                elif isinstance(first_seed, Seed):
+                    crawl.firstSeed = first_seed.url
+                crawl.seedCount = len(config.config.seeds)
 
         user = await self.user_manager.get(crawl.userid)
         if user:
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index f3b19548..2d939f78 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -75,6 +75,38 @@ def test_crawl_info(admin_auth_headers, default_org_id, admin_crawl_id):
     assert data["fileSize"] == wacz_size
 
 
+def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_crawl_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    assert data["firstSeed"] == "https://webrecorder.net/"
+    assert data["seedCount"] == 1
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    crawls = data["crawls"]
+    assert crawls
+    for crawl in crawls:
+        assert crawl["firstSeed"]
+        assert crawl["seedCount"] > 0
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/all/crawls",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    crawls = data["crawls"]
+    assert crawls
+    for crawl in crawls:
+        assert crawl["firstSeed"]
+        assert crawl["seedCount"] > 0
+
+
 def test_download_wacz():
     r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
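Note (illustration only, not part of the patch): a minimal sketch of how the seed-resolution logic added to `_resolve_crawl_refs` populates `firstSeed` and `seedCount`. The `SeedStub` model and `resolve_seed_info` helper below are hypothetical stand-ins for the real `Seed` model imported from `.crawlconfigs`; only the `HttpUrl`-vs-`Seed` branching mirrors the diff above.

```python
from typing import List, Optional, Tuple, Union

from pydantic import BaseModel, HttpUrl


class SeedStub(BaseModel):
    """Hypothetical stand-in for btrixcloud.crawlconfigs.Seed."""

    url: HttpUrl


def resolve_seed_info(
    seeds: List[Union[HttpUrl, SeedStub]]
) -> Tuple[Optional[str], int]:
    """Return (firstSeed, seedCount) the same way the patched method does."""
    first_seed: Optional[str] = None
    if seeds:
        first = seeds[0]
        # Seeds may be plain URLs or full Seed objects, as in the diff above.
        first_seed = str(first) if isinstance(first, HttpUrl) else str(first.url)
    return first_seed, len(seeds)


# Example: a config with a single Seed object
print(resolve_seed_info([SeedStub(url="https://webrecorder.net/")]))
# -> ('https://webrecorder.net/', 1)
```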