Support sorting by last QA started time (#1712)

To support #1683, it would be useful to be able to sort by 'last QA
start time' in addition to/instead of last QA state.
- Make sorting consistent with workflow sorting
- Rename sortBy fields to lastQAState and lastQAStarted
- Include current QA runs in the lastQAState/lastQAStarted fields, rather than separating them out into different values

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
Ilya Kreymer 2024-04-22 13:00:52 -07:00 committed by GitHub
parent b574f00d2b
commit 1844e761dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 89 additions and 32 deletions

View File

@ -551,8 +551,6 @@ class BaseCrawlOps:
{"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}}, {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
{"$set": {"firstSeed": "$firstSeedObject.url"}}, {"$set": {"firstSeed": "$firstSeedObject.url"}},
{"$unset": ["firstSeedObject", "errors", "config"]}, {"$unset": ["firstSeedObject", "errors", "config"]},
{"$set": {"qaState": "$qa.state"}},
{"$set": {"activeQAState": "$qaState"}},
{"$set": {"activeQAStats": "$qa.stats"}}, {"$set": {"activeQAStats": "$qa.stats"}},
{ {
"$set": { "$set": {
@ -564,11 +562,23 @@ class BaseCrawlOps:
} }
} }
}, },
# Add the active QA run to the array (if one exists) prior to sorting, taking care
# not to pass null to $concatArrays, which would make the whole result null
{
"$set": {
"qaActiveArray": {"$cond": [{"$ne": ["$qa", None]}, ["$qa"], []]}
}
},
{
"$set": {
"qaArray": {"$concatArrays": ["$qaFinishedArray", "$qaActiveArray"]}
}
},
{ {
"$set": { "$set": {
"sortedQARuns": { "sortedQARuns": {
"$sortArray": { "$sortArray": {
"input": "$qaFinishedArray", "input": "$qaArray",
"sortBy": {"started": -1}, "sortBy": {"started": -1},
} }
} }
@ -576,13 +586,14 @@ class BaseCrawlOps:
}, },
{"$set": {"lastQARun": {"$arrayElemAt": ["$sortedQARuns", 0]}}}, {"$set": {"lastQARun": {"$arrayElemAt": ["$sortedQARuns", 0]}}},
{"$set": {"lastQAState": "$lastQARun.state"}}, {"$set": {"lastQAState": "$lastQARun.state"}},
{"$set": {"lastQAStarted": "$lastQARun.started"}},
{ {
"$set": { "$set": {
"qaRunCount": { "qaRunCount": {
"$size": { "$size": {
"$cond": [ "$cond": [
{"$isArray": "$qaFinishedArray"}, {"$isArray": "$qaArray"},
"$qaFinishedArray", "$qaArray",
[], [],
] ]
} }
@ -592,7 +603,9 @@ class BaseCrawlOps:
{ {
"$unset": [ "$unset": [
"lastQARun", "lastQARun",
"qaActiveArray",
"qaFinishedArray", "qaFinishedArray",
"qaArray",
"sortedQARuns", "sortedQARuns",
] ]
}, },
@ -619,8 +632,9 @@ class BaseCrawlOps:
"finished", "finished",
"fileSize", "fileSize",
"reviewStatus", "reviewStatus",
"lastQAStarted",
"lastQAState",
"qaRunCount", "qaRunCount",
"qaState",
): ):
raise HTTPException(status_code=400, detail="invalid_sort_by") raise HTTPException(status_code=400, detail="invalid_sort_by")
if sort_direction not in (1, -1): if sort_direction not in (1, -1):
@ -628,10 +642,8 @@ class BaseCrawlOps:
sort_query = {sort_by: sort_direction} sort_query = {sort_by: sort_direction}
# Secondary sort for qaState - sorted by current, then last # Ensure crawls are always sorted first for QA-related sorts
# Tertiary sort for qaState - type, always ascending so crawls are first if sort_by in ("lastQAStarted", "lastQAState"):
if sort_by == "qaState":
sort_query["lastQAState"] = sort_direction
sort_query["type"] = 1 sort_query["type"] = 1
aggregate.extend([{"$sort": sort_query}]) aggregate.extend([{"$sort": sort_query}])

View File

@ -169,8 +169,6 @@ class CrawlOps(BaseCrawlOps):
{"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}}, {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
{"$set": {"firstSeed": "$firstSeedObject.url"}}, {"$set": {"firstSeed": "$firstSeedObject.url"}},
{"$unset": ["firstSeedObject", "errors", "config"]}, {"$unset": ["firstSeedObject", "errors", "config"]},
{"$set": {"qaState": "$qa.state"}},
{"$set": {"activeQAState": "$qaState"}},
{"$set": {"activeQAStats": "$qa.stats"}}, {"$set": {"activeQAStats": "$qa.stats"}},
{ {
"$set": { "$set": {
@ -182,11 +180,23 @@ class CrawlOps(BaseCrawlOps):
} }
} }
}, },
# Add the active QA run to the array (if one exists) prior to sorting, taking care
# not to pass null to $concatArrays, which would make the whole result null
{
"$set": {
"qaActiveArray": {"$cond": [{"$ne": ["$qa", None]}, ["$qa"], []]}
}
},
{
"$set": {
"qaArray": {"$concatArrays": ["$qaFinishedArray", "$qaActiveArray"]}
}
},
{ {
"$set": { "$set": {
"sortedQARuns": { "sortedQARuns": {
"$sortArray": { "$sortArray": {
"input": "$qaFinishedArray", "input": "$qaArray",
"sortBy": {"started": -1}, "sortBy": {"started": -1},
} }
} }
@ -194,13 +204,14 @@ class CrawlOps(BaseCrawlOps):
}, },
{"$set": {"lastQARun": {"$arrayElemAt": ["$sortedQARuns", 0]}}}, {"$set": {"lastQARun": {"$arrayElemAt": ["$sortedQARuns", 0]}}},
{"$set": {"lastQAState": "$lastQARun.state"}}, {"$set": {"lastQAState": "$lastQARun.state"}},
{"$set": {"lastQAStarted": "$lastQARun.started"}},
{ {
"$set": { "$set": {
"qaRunCount": { "qaRunCount": {
"$size": { "$size": {
"$cond": [ "$cond": [
{"$isArray": "$qaFinishedArray"}, {"$isArray": "$qaArray"},
"$qaFinishedArray", "$qaArray",
[], [],
] ]
} }
@ -210,7 +221,9 @@ class CrawlOps(BaseCrawlOps):
{ {
"$unset": [ "$unset": [
"lastQARun", "lastQARun",
"qaActiveArray",
"qaFinishedArray", "qaFinishedArray",
"qaArray",
"sortedQARuns", "sortedQARuns",
] ]
}, },
@ -239,19 +252,14 @@ class CrawlOps(BaseCrawlOps):
"firstSeed", "firstSeed",
"reviewStatus", "reviewStatus",
"qaRunCount", "qaRunCount",
"qaState", "lastQAState",
"lastQAStarted",
): ):
raise HTTPException(status_code=400, detail="invalid_sort_by") raise HTTPException(status_code=400, detail="invalid_sort_by")
if sort_direction not in (1, -1): if sort_direction not in (1, -1):
raise HTTPException(status_code=400, detail="invalid_sort_direction") raise HTTPException(status_code=400, detail="invalid_sort_direction")
sort_query = {sort_by: sort_direction} aggregate.extend([{"$sort": {sort_by: sort_direction}}])
# Add secondary sort for qaState - sorted by current, then last
if sort_by == "qaState":
sort_query["lastQAState"] = sort_direction
aggregate.extend([{"$sort": sort_query}])
aggregate.extend( aggregate.extend(
[ [

View File

@ -660,9 +660,9 @@ class CrawlOut(BaseMongoModel):
reviewStatus: Optional[conint(ge=1, le=5)] = None # type: ignore reviewStatus: Optional[conint(ge=1, le=5)] = None # type: ignore
qaRunCount: int = 0 qaRunCount: int = 0
activeQAState: Optional[str]
activeQAStats: Optional[CrawlStats] activeQAStats: Optional[CrawlStats]
lastQAState: Optional[str] lastQAState: Optional[str]
lastQAStarted: Optional[datetime]
# ============================================================================ # ============================================================================

View File

@ -116,29 +116,53 @@ def failed_qa_run_id(crawler_crawl_id, crawler_auth_headers, default_org_id):
assert qa["started"] assert qa["started"]
assert not qa["finished"] assert not qa["finished"]
# Ensure sorting by qaState works as expected - current floated to top # Ensure sorting by lastQAState works as expected - current floated to top
r = requests.get( r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=qaState", f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=lastQAState",
headers=crawler_auth_headers, headers=crawler_auth_headers,
) )
assert r.status_code == 200 assert r.status_code == 200
crawls = r.json()["items"] crawls = r.json()["items"]
assert crawls[0]["id"] == crawler_crawl_id assert crawls[0]["id"] == crawler_crawl_id
assert crawls[0]["activeQAState"]
assert crawls[0]["activeQAStats"] assert crawls[0]["activeQAStats"]
assert crawls[0]["lastQAState"] assert crawls[0]["lastQAState"]
assert crawls[0]["lastQAStarted"]
# Ensure sorting by qaState works as expected with all-crawls # Ensure sorting by lastQAState works as expected with all-crawls
r = requests.get( r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=qaState", f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAState",
headers=crawler_auth_headers, headers=crawler_auth_headers,
) )
assert r.status_code == 200 assert r.status_code == 200
crawls = r.json()["items"] crawls = r.json()["items"]
assert crawls[0]["id"] == crawler_crawl_id assert crawls[0]["id"] == crawler_crawl_id
assert crawls[0]["activeQAState"]
assert crawls[0]["activeQAStats"] assert crawls[0]["activeQAStats"]
assert crawls[0]["lastQAState"] assert crawls[0]["lastQAState"]
assert crawls[0]["lastQAStarted"]
# Ensure sorting by lastQAStarted works as expected - current floated to top
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=lastQAStarted",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == crawler_crawl_id
assert crawls[0]["activeQAStats"]
assert crawls[0]["lastQAState"]
assert crawls[0]["lastQAStarted"]
# Ensure sorting by lastQAStarted works as expected with all-crawls
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAStarted",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == crawler_crawl_id
assert crawls[0]["activeQAStats"]
assert crawls[0]["lastQAState"]
assert crawls[0]["lastQAStarted"]
# Cancel crawl # Cancel crawl
r = requests.post( r = requests.post(

View File

@ -419,9 +419,22 @@ def test_list_all_crawls(
assert item["finished"] assert item["finished"]
assert item["state"] assert item["state"]
# Test that all-crawls qaState sort always puts crawls before uploads # Test that all-crawls lastQAState and lastQAStarted sorts always put crawls before uploads
r = requests.get( r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=qaState", f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAState",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
last_type = None
for item in data["items"]:
if last_type == "upload":
assert item["type"] != "crawl"
last_type = item["type"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAStarted",
headers=admin_auth_headers, headers=admin_auth_headers,
) )
assert r.status_code == 200 assert r.status_code == 200