Add reviewStatus, qaState, and qaRunCount sort options to crawls/all-crawls list endpoints (#1686)

Backend work for #1672 

Adds new sort options to /crawls and /all-crawls GET list endpoints:

- `reviewStatus`
- `qaRunCount`: the number of completed QA runs for the crawl (also added to
CrawlOut)
- `qaState`: sorts by `activeQAState` first, then `lastQAState`, both of
which are added to CrawlOut
This commit is contained in:
Tessa Walsh 2024-04-17 02:54:09 -04:00 committed by GitHub
parent 87e0873f1a
commit c800da1732
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 267 additions and 3 deletions

View File

@ -551,6 +551,50 @@ class BaseCrawlOps:
{"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}}, {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
{"$set": {"firstSeed": "$firstSeedObject.url"}}, {"$set": {"firstSeed": "$firstSeedObject.url"}},
{"$unset": ["firstSeedObject", "errors", "config"]}, {"$unset": ["firstSeedObject", "errors", "config"]},
{"$set": {"qaState": "$qa.state"}},
{"$set": {"activeQAState": "$qaState"}},
{
"$set": {
"qaFinishedArray": {
"$map": {
"input": {"$objectToArray": "$qaFinished"},
"in": "$$this.v",
}
}
}
},
{
"$set": {
"sortedQARuns": {
"$sortArray": {
"input": "$qaFinishedArray",
"sortBy": {"started": -1},
}
}
}
},
{"$set": {"lastQARun": {"$arrayElemAt": ["$sortedQARuns", 0]}}},
{"$set": {"lastQAState": "$lastQARun.state"}},
{
"$set": {
"qaRunCount": {
"$size": {
"$cond": [
{"$isArray": "$qaFinishedArray"},
"$qaFinishedArray",
[],
]
}
}
}
},
{
"$unset": [
"lastQARun",
"qaFinishedArray",
"sortedQARuns",
]
},
] ]
if not resources: if not resources:
@ -569,12 +613,25 @@ class BaseCrawlOps:
aggregate.extend([{"$match": {"collectionIds": {"$in": [collection_id]}}}]) aggregate.extend([{"$match": {"collectionIds": {"$in": [collection_id]}}}])
if sort_by: if sort_by:
if sort_by not in ("started", "finished", "fileSize"): if sort_by not in (
"started",
"finished",
"fileSize",
"reviewStatus",
"qaRunCount",
"qaState",
):
raise HTTPException(status_code=400, detail="invalid_sort_by") raise HTTPException(status_code=400, detail="invalid_sort_by")
if sort_direction not in (1, -1): if sort_direction not in (1, -1):
raise HTTPException(status_code=400, detail="invalid_sort_direction") raise HTTPException(status_code=400, detail="invalid_sort_direction")
aggregate.extend([{"$sort": {sort_by: sort_direction}}]) sort_query = {sort_by: sort_direction}
# Add secondary sort for qaState - sorted by current, then last
if sort_by == "qaState":
sort_query["lastQAState"] = sort_direction
aggregate.extend([{"$sort": sort_query}])
aggregate.extend( aggregate.extend(
[ [

View File

@ -165,6 +165,50 @@ class CrawlOps(BaseCrawlOps):
{"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}}, {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
{"$set": {"firstSeed": "$firstSeedObject.url"}}, {"$set": {"firstSeed": "$firstSeedObject.url"}},
{"$unset": ["firstSeedObject", "errors", "config"]}, {"$unset": ["firstSeedObject", "errors", "config"]},
{"$set": {"qaState": "$qa.state"}},
{"$set": {"activeQAState": "$qaState"}},
{
"$set": {
"qaFinishedArray": {
"$map": {
"input": {"$objectToArray": "$qaFinished"},
"in": "$$this.v",
}
}
}
},
{
"$set": {
"sortedQARuns": {
"$sortArray": {
"input": "$qaFinishedArray",
"sortBy": {"started": -1},
}
}
}
},
{"$set": {"lastQARun": {"$arrayElemAt": ["$sortedQARuns", 0]}}},
{"$set": {"lastQAState": "$lastQARun.state"}},
{
"$set": {
"qaRunCount": {
"$size": {
"$cond": [
{"$isArray": "$qaFinishedArray"},
"$qaFinishedArray",
[],
]
}
}
}
},
{
"$unset": [
"lastQARun",
"qaFinishedArray",
"sortedQARuns",
]
},
] ]
if not resources: if not resources:
@ -188,12 +232,21 @@ class CrawlOps(BaseCrawlOps):
"finished", "finished",
"fileSize", "fileSize",
"firstSeed", "firstSeed",
"reviewStatus",
"qaRunCount",
"qaState",
): ):
raise HTTPException(status_code=400, detail="invalid_sort_by") raise HTTPException(status_code=400, detail="invalid_sort_by")
if sort_direction not in (1, -1): if sort_direction not in (1, -1):
raise HTTPException(status_code=400, detail="invalid_sort_direction") raise HTTPException(status_code=400, detail="invalid_sort_direction")
aggregate.extend([{"$sort": {sort_by: sort_direction}}]) sort_query = {sort_by: sort_direction}
# Add secondary sort for qaState - sorted by current, then last
if sort_by == "qaState":
sort_query["lastQAState"] = sort_direction
aggregate.extend([{"$sort": sort_query}])
aggregate.extend( aggregate.extend(
[ [

View File

@ -658,6 +658,10 @@ class CrawlOut(BaseMongoModel):
reviewStatus: Optional[conint(ge=1, le=5)] = None # type: ignore reviewStatus: Optional[conint(ge=1, le=5)] = None # type: ignore
qaRunCount: int = 0
activeQAState: Optional[str]
lastQAState: Optional[str]
# ============================================================================ # ============================================================================
class CrawlOutWithResources(CrawlOut): class CrawlOutWithResources(CrawlOut):

View File

@ -116,6 +116,28 @@ def failed_qa_run_id(crawler_crawl_id, crawler_auth_headers, default_org_id):
assert qa["started"] assert qa["started"]
assert not qa["finished"] assert not qa["finished"]
# Ensure sorting by qaState works as expected - current floated to top
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=qaState",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == crawler_crawl_id
assert crawls[0]["activeQAState"]
assert crawls[0]["lastQAState"]
# Ensure sorting by qaState works as expected with all-crawls
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=qaState",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == crawler_crawl_id
assert crawls[0]["activeQAState"]
assert crawls[0]["lastQAState"]
# Cancel crawl # Cancel crawl
r = requests.post( r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/cancel", f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/cancel",
@ -340,6 +362,96 @@ def test_failed_qa_run(
assert qa["crawlExecSeconds"] > 0 assert qa["crawlExecSeconds"] > 0
def test_sort_crawls_by_qa_runs(
crawler_crawl_id,
crawler_auth_headers,
default_org_id,
failed_qa_run_id,
qa_run_pages_ready,
):
# Test that sorting by qaRunCount works as expected
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=qaRunCount",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == crawler_crawl_id
qa_run_count = crawls[0]["qaRunCount"]
assert qa_run_count > 0
last_count = qa_run_count
for crawl in crawls:
if crawl["id"] == crawler_crawl_id:
continue
crawl_qa_count = crawl["qaRunCount"]
assert isinstance(crawl_qa_count, int)
assert crawl_qa_count <= last_count
last_count = crawl_qa_count
# Test ascending sort
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=qaRunCount&sortDirection=1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[-1]["id"] == crawler_crawl_id
assert crawls[-1]["qaRunCount"] > 0
last_count = 0
for crawl in crawls:
if crawl["id"] == crawler_crawl_id:
continue
crawl_qa_count = crawl["qaRunCount"]
assert isinstance(crawl_qa_count, int)
assert crawl_qa_count >= last_count
last_count = crawl_qa_count
# Test same with all-crawls
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=qaRunCount",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == crawler_crawl_id
qa_run_count = crawls[0]["qaRunCount"]
assert qa_run_count > 0
last_count = qa_run_count
for crawl in crawls:
if crawl["id"] == crawler_crawl_id:
continue
crawl_qa_count = crawl["qaRunCount"]
assert isinstance(crawl_qa_count, int)
assert crawl_qa_count <= last_count
last_count = crawl_qa_count
# Test ascending sort
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=qaRunCount&sortDirection=1",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[-1]["id"] == crawler_crawl_id
assert crawls[-1]["qaRunCount"] > 0
last_count = 0
for crawl in crawls:
if crawl["id"] == crawler_crawl_id:
continue
crawl_qa_count = crawl["qaRunCount"]
assert isinstance(crawl_qa_count, int)
assert crawl_qa_count >= last_count
last_count = crawl_qa_count
def test_delete_qa_runs( def test_delete_qa_runs(
crawler_crawl_id, crawler_crawl_id,
crawler_auth_headers, crawler_auth_headers,

View File

@ -303,6 +303,44 @@ def test_update_crawl(
assert r.status_code == 200 assert r.status_code == 200
assert r.json()["reviewStatus"] == 5 assert r.json()["reviewStatus"] == 5
# Test sorting on reviewStatus
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=reviewStatus",
headers=admin_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == admin_crawl_id
assert crawls[0]["reviewStatus"] == 5
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=reviewStatus&sortDirection=1",
headers=admin_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[-1]["id"] == admin_crawl_id
assert crawls[-1]["reviewStatus"] == 5
# Test sorting on reviewStatus for all-crawls
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=reviewStatus",
headers=admin_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == admin_crawl_id
assert crawls[0]["reviewStatus"] == 5
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=reviewStatus&sortDirection=1",
headers=admin_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[-1]["id"] == admin_crawl_id
assert crawls[-1]["reviewStatus"] == 5
# Try to update to invalid reviewStatus # Try to update to invalid reviewStatus
r = requests.patch( r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}", f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",