Dynamically calculate crawl stats for crawlconfig endpoints (#623)
parent cbab425fec
commit 567e851235
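In practice, this change means the crawl stats on a crawl config (`crawlCount`, `lastCrawlId`, `lastCrawlTime`, `lastCrawlState`) are computed at request time from the config's associated crawls instead of being read back from stored counters. A minimal sketch of what an API consumer sees, assuming a running backend; `API_PREFIX`, the IDs, and the auth header below are hypothetical placeholders standing in for the `admin_auth_headers`, `default_org_id`, and `crawl_config_info` fixtures used by the nightly tests added in this commit:

```python
import requests

# Hypothetical placeholders -- in the nightly tests these values come from fixtures.
API_PREFIX = "http://localhost:30870/api"
org_id = "<org-uuid>"
crawl_config_id = "<crawlconfig-uuid>"
headers = {"Authorization": "Bearer <admin-token>"}

r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/{crawl_config_id}",
    headers=headers,
)
data = r.json()

# These fields are now derived per request from the config's crawls,
# so deleting a crawl is reflected immediately on the next read.
print(data["crawlCount"], data["lastCrawlId"], data["lastCrawlState"], data["lastCrawlTime"])
```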
@@ -212,28 +212,15 @@ class CrawlJob(ABC):
         await self.update_crawl(state=state, finished=self.finished)
 
         if completed:
-            await self.inc_crawl_complete_stats(state)
+            await self.inc_crawl_complete_stats()
 
-    async def inc_crawl_complete_stats(self, state):
+    async def inc_crawl_complete_stats(self):
         """Increment Crawl Stats"""
 
         duration = int((self.finished - self.started).total_seconds())
 
         print(f"Duration: {duration}", flush=True)
 
-        # init crawl config stats
-        await self.crawl_configs.find_one_and_update(
-            {"_id": self.cid, "inactive": {"$ne": True}},
-            {
-                "$inc": {"crawlCount": 1},
-                "$set": {
-                    "lastCrawlId": self.job_id,
-                    "lastCrawlTime": self.finished,
-                    "lastCrawlState": state,
-                },
-            },
-        )
-
         # init org crawl stats
         yymm = datetime.utcnow().strftime("%Y-%m")
         await self.orgs.find_one_and_update(
@@ -139,8 +139,11 @@ class CrawlConfig(BaseMongoModel):
     crawlAttemptCount: Optional[int] = 0
 
+    # These fields would ideally be in CrawlConfigOut, but are being
+    # kept here to prevent the need for a migration. Eventually, we
+    # may want to add a migration and move them, as these values are
+    # now generated dynamically in API endpoints as needed.
     crawlCount: Optional[int] = 0
 
     lastCrawlId: Optional[str]
     lastCrawlTime: Optional[datetime]
     lastCrawlState: Optional[str]
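The new comment above describes an eventual cleanup: moving these stats onto the output model once a migration is acceptable. A rough sketch of what that split might look like, assuming pydantic models (which `BaseMongoModel` suggests) and that `CrawlConfigOut` extends the stored config; only fields named in this diff are shown, the rest is illustrative:

```python
from datetime import datetime
from typing import Optional

from pydantic import BaseModel


class CrawlConfig(BaseModel):
    """Stored config document (stub): only write-time bookkeeping remains here."""

    crawlAttemptCount: Optional[int] = 0


class CrawlConfigOut(CrawlConfig):
    """API output model (stub): stats computed per request, never persisted."""

    currCrawlId: Optional[str] = None

    crawlCount: Optional[int] = 0
    lastCrawlId: Optional[str] = None
    lastCrawlTime: Optional[datetime] = None
    lastCrawlState: Optional[str] = None
```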
@@ -400,6 +403,7 @@ class CrawlConfigOps:
         configs = []
         for res in results:
             config = CrawlConfigOut.from_dict(res)
+            config = await self._annotate_with_crawl_stats(config)
             # pylint: disable=invalid-name
             config.currCrawlId = running.get(config.id)
             configs.append(config)
@@ -430,6 +434,25 @@ class CrawlConfigOps:
 
         return None
 
+    async def _annotate_with_crawl_stats(self, crawlconfig: CrawlConfigOut):
+        """Annotate crawlconfig with information about associated crawls"""
+        crawls = await self.crawl_ops.list_crawls(cid=crawlconfig.id)
+
+        crawlconfig.crawlCount = len(crawls)
+
+        finished_crawls = [crawl for crawl in crawls if crawl.finished]
+        if not finished_crawls:
+            return crawlconfig
+
+        sorted_crawls = sorted(finished_crawls, key=lambda crawl: crawl.finished)
+        last_crawl = sorted_crawls[-1]
+
+        crawlconfig.lastCrawlId = str(last_crawl.id)
+        crawlconfig.lastCrawlTime = last_crawl.finished
+        crawlconfig.lastCrawlState = last_crawl.state
+
+        return crawlconfig
+
     async def get_crawl_config_out(self, cid: uuid.UUID, org: Organization):
         """Return CrawlConfigOut, including state of currently running crawl, if active
         also include inactive crawl configs"""
@@ -455,6 +478,8 @@ class CrawlConfigOps:
                 crawlconfig.profileid, org
             )
 
+        crawlconfig = await self._annotate_with_crawl_stats(crawlconfig)
+
         return crawlconfig
 
     async def get_crawl_config(
@@ -105,3 +105,48 @@ def crawl_id_wr_specs(admin_auth_headers, default_org_id):
         if data["state"] == "complete":
             return crawl_id
         time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def crawl_config_info(admin_auth_headers, default_org_id):
+    # Start crawl.
+    crawl_data = {
+        "runNow": True,
+        "name": "Crawl config test",
+        "config": {"seeds": ["https://specs.webrecorder.net/"], "limit": 1},
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_config_id = data["added"]
+    crawl_id = data["run_now_job"]
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            break
+        time.sleep(5)
+
+    # Run second crawl from crawlconfig and return info when it finishes
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}/run",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    second_crawl_id = data["started"]
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{second_crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return (crawl_config_id, crawl_id, second_crawl_id)
+        time.sleep(5)
backend/test_nightly/test_crawlconfig_crawl_stats.py (new file, 84 lines)
@@ -0,0 +1,84 @@
+import requests
+
+from .conftest import API_PREFIX
+
+
+def test_crawlconfig_crawl_stats(admin_auth_headers, default_org_id, crawl_config_info):
+    crawl_config_id, crawl_id, second_crawl_id = crawl_config_info
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    first_crawl_finished = data["finished"]
+    assert first_crawl_finished
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{second_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    second_crawl_finished = data["finished"]
+    assert second_crawl_finished
+
+    # Verify crawl stats from /crawlconfigs
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["crawlAttemptCount"] == 2
+    assert data["crawlCount"] == 2
+    assert data["lastCrawlId"] == second_crawl_id
+    assert data["lastCrawlState"] == "complete"
+    assert data["lastCrawlTime"] == second_crawl_finished
+
+    # Delete second crawl
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=admin_auth_headers,
+        json={"crawl_ids": [second_crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"]
+
+    # Verify crawl stats from /crawlconfigs
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["crawlAttemptCount"] == 2
+    assert data["crawlCount"] == 1
+    assert data["lastCrawlId"] == crawl_id
+    assert data["lastCrawlState"] == "complete"
+    assert data["lastCrawlTime"] == first_crawl_finished
+
+    # Delete first crawl
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=admin_auth_headers,
+        json={"crawl_ids": [crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"]
+
+    # Verify crawl stats from /crawlconfigs
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["crawlAttemptCount"] == 2
+    assert data["crawlCount"] == 0
+    assert not data["lastCrawlId"]
+    assert not data["lastCrawlState"]
+    assert not data["lastCrawlTime"]