Include firstSeed and seedCount in crawl endpoints (#618)
parent c309b809da · commit ed94dde7e6
@@ -10,11 +10,11 @@ from typing import Optional, List, Dict, Union
 from datetime import datetime, timedelta
 
 from fastapi import Depends, HTTPException
-from pydantic import BaseModel, UUID4, conint
+from pydantic import BaseModel, UUID4, conint, HttpUrl
 from redis import asyncio as aioredis, exceptions
 import pymongo
 
+from .crawlconfigs import Seed
 from .db import BaseMongoModel
 from .users import User
 from .orgs import Organization, MAX_CRAWL_SCALE
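
The new HttpUrl and Seed imports reflect the two shapes a configured seed can take: a bare URL, validated by pydantic as an HttpUrl, or a full Seed object with its own per-seed settings. A minimal sketch of the distinction, assuming pydantic v1 semantics and a reduced, hypothetical Seed model (the real one in .crawlconfigs carries more fields):

from typing import List, Optional, Union

from pydantic import BaseModel, HttpUrl


class Seed(BaseModel):
    # Reduced, hypothetical stand-in for .crawlconfigs.Seed
    url: HttpUrl
    scopeType: Optional[str] = None


class SeedHolder(BaseModel):
    seeds: List[Union[HttpUrl, Seed]]


# A bare URL string validates as HttpUrl; a dict validates as Seed:
cfg = SeedHolder(seeds=["https://webrecorder.net/", {"url": "https://example.com/"}])
assert isinstance(cfg.seeds[0], HttpUrl)
assert isinstance(cfg.seeds[1], Seed)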
@@ -91,11 +91,13 @@ class Crawl(BaseMongoModel):
 
 # ============================================================================
 class CrawlOut(Crawl):
-    """Output for single crawl, add configName and userName"""
+    """Output for single crawl, with additional fields"""
 
     userName: Optional[str]
     configName: Optional[str]
     resources: Optional[List[CrawlFileOut]] = []
+    firstSeed: Optional[str]
+    seedCount: Optional[int] = 0
 
 
 # ============================================================================
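
With these fields on CrawlOut, a single-crawl response now carries the first seed URL and the total seed count alongside the resolved userName and configName. The shape, purely for illustration (values are example data matching the tests below, not captured API output):

example_response = {
    "id": "<crawl-id>",
    "userName": "admin",
    "configName": "<config name>",
    "resources": [],
    "firstSeed": "https://webrecorder.net/",
    "seedCount": 1,
}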
@@ -128,6 +130,9 @@ class ListCrawlOut(BaseMongoModel):
 
     notes: Optional[str]
 
+    firstSeed: Optional[str]
+    seedCount: Optional[int] = 0
+
 
 # ============================================================================
 class ListCrawls(BaseModel):
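
ListCrawlOut gets the same two fields separately, since the list model does not inherit from CrawlOut. Both are Optional, with seedCount defaulting to 0, so crawl documents stored before this change still deserialize cleanly; a minimal sketch of that behavior, again assuming pydantic v1 semantics:

from typing import Optional

from pydantic import BaseModel


class SeedFields(BaseModel):
    # Stand-in holding only the two new fields
    firstSeed: Optional[str]
    seedCount: Optional[int] = 0


old_doc = SeedFields.parse_obj({})  # a document written before this change
assert old_doc.firstSeed is None
assert old_doc.seedCount == 0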
@@ -252,6 +257,8 @@ class CrawlOps:
         results = await cursor.to_list(length=1000)
         crawls = [crawl_cls.from_dict(res) for res in results]
+
+        crawls = [await self._resolve_crawl_refs(crawl, org) for crawl in crawls]
 
         return crawls
 
     async def get_crawl_raw(self, crawlid: str, org: Organization):
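
The new list comprehension awaits _resolve_crawl_refs once per crawl, so a full page of results (up to 1000 here) performs its config and user lookups sequentially. If that ever became a bottleneck, the same resolution could be fired concurrently; a generic sketch using asyncio.gather, not what the commit does:

import asyncio
from typing import Awaitable, Callable, List, TypeVar

T = TypeVar("T")


async def resolve_all(items: List[T], resolve: Callable[[T], Awaitable[T]]) -> List[T]:
    # Start every resolution at once instead of awaiting them one by one
    return list(await asyncio.gather(*(resolve(item) for item in items)))

Usage would then be roughly crawls = await resolve_all(crawls, lambda c: self._resolve_crawl_refs(c, org)).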
@@ -285,7 +292,7 @@ class CrawlOps:
         return await self._resolve_crawl_refs(crawl, org)
 
     async def _resolve_crawl_refs(
-        self, crawl: Union[CrawlOut, ListCrawlOut], org: Organization
+        self, crawl: Union[CrawlOut, ListCrawlOut], org: Optional[Organization]
     ):
         """Resolve running crawl data"""
         config = await self.crawl_configs.get_crawl_config(
@@ -293,7 +300,16 @@ class CrawlOps:
         )
 
         if config:
-            crawl.configName = config.name
+            if not crawl.configName:
+                crawl.configName = config.name
+
+            if config.config.seeds:
+                first_seed = config.config.seeds[0]
+                if isinstance(first_seed, HttpUrl):
+                    crawl.firstSeed = first_seed
+                elif isinstance(first_seed, Seed):
+                    crawl.firstSeed = first_seed.url
+                crawl.seedCount = len(config.config.seeds)
 
         user = await self.user_manager.get(crawl.userid)
         if user:
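
The seed handling above inspects only the first entry of the config's seed list and branches on its type, since a seed may be stored either as a plain HttpUrl or as a Seed object. The same logic as a self-contained sketch, using a reduced stand-in for the real Seed model:

from typing import List, Optional, Tuple, Union

from pydantic import BaseModel, HttpUrl


class Seed(BaseModel):
    # Reduced, hypothetical stand-in for .crawlconfigs.Seed
    url: HttpUrl


def first_seed_info(seeds: List[Union[HttpUrl, Seed]]) -> Tuple[Optional[str], int]:
    # Mirrors what _resolve_crawl_refs stores into firstSeed / seedCount
    if not seeds:
        return None, 0
    first = seeds[0]
    if isinstance(first, HttpUrl):
        return str(first), len(seeds)
    if isinstance(first, Seed):
        return str(first.url), len(seeds)
    return None, len(seeds)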
@@ -75,6 +75,38 @@ def test_crawl_info(admin_auth_headers, default_org_id, admin_crawl_id):
     assert data["fileSize"] == wacz_size
 
 
+def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_crawl_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    assert data["firstSeed"] == "https://webrecorder.net/"
+    assert data["seedCount"] == 1
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    crawls = data["crawls"]
+    assert crawls
+    for crawl in crawls:
+        assert crawl["firstSeed"]
+        assert crawl["seedCount"] > 0
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/all/crawls",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    crawls = data["crawls"]
+    assert crawls
+    for crawl in crawls:
+        assert crawl["firstSeed"]
+        assert crawl["seedCount"] > 0
+
+
 def test_download_wacz():
     r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
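
The new test exercises all three crawl endpoints: the single-crawl lookup, the per-org list, and the all-orgs list, asserting that the seed fields are populated in each. For a quick manual check against a running instance, the list endpoint can be queried directly; every value below is a placeholder or assumption, not part of the commit:

import requests

API_PREFIX = "http://localhost:30870/api"  # assumption: adjust to your deployment
ORG_ID = "<org-uuid>"
HEADERS = {"Authorization": "Bearer <access-token>"}

r = requests.get(f"{API_PREFIX}/orgs/{ORG_ID}/crawls", headers=HEADERS)
for crawl in r.json()["crawls"]:
    print(crawl["firstSeed"], crawl["seedCount"])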