Support sorting by last QA started time (#1712)

To support #1683, it would be useful to be able to sort by 'last QA
start time' in addition to/instead of last QA state.
- Make sorting consistent with workflow sorting
- Rename sortBy fields to lastQAState and lastQAStarted
- Include current QA runs in the lastQAState/lastQAStarted fields, rather than separating them out into different values

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
Ilya Kreymer 2024-04-22 13:00:52 -07:00 committed by GitHub
parent b574f00d2b
commit 1844e761dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 89 additions and 32 deletions

View File

@ -551,8 +551,6 @@ class BaseCrawlOps:
{"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
{"$set": {"firstSeed": "$firstSeedObject.url"}},
{"$unset": ["firstSeedObject", "errors", "config"]},
{"$set": {"qaState": "$qa.state"}},
{"$set": {"activeQAState": "$qaState"}},
{"$set": {"activeQAStats": "$qa.stats"}},
{
"$set": {
@ -564,11 +562,23 @@ class BaseCrawlOps:
}
}
},
# Add the active QA run (if one exists) to the array before sorting, taking
# care not to pass null to $concatArrays, which would make the result null
{
"$set": {
"qaActiveArray": {"$cond": [{"$ne": ["$qa", None]}, ["$qa"], []]}
}
},
{
"$set": {
"qaArray": {"$concatArrays": ["$qaFinishedArray", "$qaActiveArray"]}
}
},
{
"$set": {
"sortedQARuns": {
"$sortArray": {
"input": "$qaFinishedArray",
"input": "$qaArray",
"sortBy": {"started": -1},
}
}
@ -576,13 +586,14 @@ class BaseCrawlOps:
},
{"$set": {"lastQARun": {"$arrayElemAt": ["$sortedQARuns", 0]}}},
{"$set": {"lastQAState": "$lastQARun.state"}},
{"$set": {"lastQAStarted": "$lastQARun.started"}},
{
"$set": {
"qaRunCount": {
"$size": {
"$cond": [
{"$isArray": "$qaFinishedArray"},
"$qaFinishedArray",
{"$isArray": "$qaArray"},
"$qaArray",
[],
]
}
@ -592,7 +603,9 @@ class BaseCrawlOps:
{
"$unset": [
"lastQARun",
"qaActiveArray",
"qaFinishedArray",
"qaArray",
"sortedQARuns",
]
},
@ -619,8 +632,9 @@ class BaseCrawlOps:
"finished",
"fileSize",
"reviewStatus",
"lastQAStarted",
"lastQAState",
"qaRunCount",
"qaState",
):
raise HTTPException(status_code=400, detail="invalid_sort_by")
if sort_direction not in (1, -1):
@ -628,10 +642,8 @@ class BaseCrawlOps:
sort_query = {sort_by: sort_direction}
# Secondary sort for qaState - sorted by current, then last
# Tertiary sort for qaState - type, always ascending so crawls are first
if sort_by == "qaState":
sort_query["lastQAState"] = sort_direction
# Ensure crawls are always sorted first for QA-related sorts
if sort_by in ("lastQAStarted", "lastQAState"):
sort_query["type"] = 1
aggregate.extend([{"$sort": sort_query}])

View File

@ -169,8 +169,6 @@ class CrawlOps(BaseCrawlOps):
{"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
{"$set": {"firstSeed": "$firstSeedObject.url"}},
{"$unset": ["firstSeedObject", "errors", "config"]},
{"$set": {"qaState": "$qa.state"}},
{"$set": {"activeQAState": "$qaState"}},
{"$set": {"activeQAStats": "$qa.stats"}},
{
"$set": {
@ -182,11 +180,23 @@ class CrawlOps(BaseCrawlOps):
}
}
},
# Add the active QA run (if one exists) to the array before sorting, taking
# care not to pass null to $concatArrays, which would make the result null
{
"$set": {
"qaActiveArray": {"$cond": [{"$ne": ["$qa", None]}, ["$qa"], []]}
}
},
{
"$set": {
"qaArray": {"$concatArrays": ["$qaFinishedArray", "$qaActiveArray"]}
}
},
{
"$set": {
"sortedQARuns": {
"$sortArray": {
"input": "$qaFinishedArray",
"input": "$qaArray",
"sortBy": {"started": -1},
}
}
@ -194,13 +204,14 @@ class CrawlOps(BaseCrawlOps):
},
{"$set": {"lastQARun": {"$arrayElemAt": ["$sortedQARuns", 0]}}},
{"$set": {"lastQAState": "$lastQARun.state"}},
{"$set": {"lastQAStarted": "$lastQARun.started"}},
{
"$set": {
"qaRunCount": {
"$size": {
"$cond": [
{"$isArray": "$qaFinishedArray"},
"$qaFinishedArray",
{"$isArray": "$qaArray"},
"$qaArray",
[],
]
}
@ -210,7 +221,9 @@ class CrawlOps(BaseCrawlOps):
{
"$unset": [
"lastQARun",
"qaActiveArray",
"qaFinishedArray",
"qaArray",
"sortedQARuns",
]
},
@ -239,19 +252,14 @@ class CrawlOps(BaseCrawlOps):
"firstSeed",
"reviewStatus",
"qaRunCount",
"qaState",
"lastQAState",
"lastQAStarted",
):
raise HTTPException(status_code=400, detail="invalid_sort_by")
if sort_direction not in (1, -1):
raise HTTPException(status_code=400, detail="invalid_sort_direction")
sort_query = {sort_by: sort_direction}
# Add secondary sort for qaState - sorted by current, then last
if sort_by == "qaState":
sort_query["lastQAState"] = sort_direction
aggregate.extend([{"$sort": sort_query}])
aggregate.extend([{"$sort": {sort_by: sort_direction}}])
aggregate.extend(
[

View File

@ -660,9 +660,9 @@ class CrawlOut(BaseMongoModel):
reviewStatus: Optional[conint(ge=1, le=5)] = None # type: ignore
qaRunCount: int = 0
activeQAState: Optional[str]
activeQAStats: Optional[CrawlStats]
lastQAState: Optional[str]
lastQAStarted: Optional[datetime]
# ============================================================================

View File

@ -116,29 +116,53 @@ def failed_qa_run_id(crawler_crawl_id, crawler_auth_headers, default_org_id):
assert qa["started"]
assert not qa["finished"]
# Ensure sorting by qaState works as expected - current floated to top
# Ensure sorting by lastQAState works as expected - current floated to top
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=qaState",
f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=lastQAState",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == crawler_crawl_id
assert crawls[0]["activeQAState"]
assert crawls[0]["activeQAStats"]
assert crawls[0]["lastQAState"]
assert crawls[0]["lastQAStarted"]
# Ensure sorting by qaState works as expected with all-crawls
# Ensure sorting by lastQAState works as expected with all-crawls
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=qaState",
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAState",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == crawler_crawl_id
assert crawls[0]["activeQAState"]
assert crawls[0]["activeQAStats"]
assert crawls[0]["lastQAState"]
assert crawls[0]["lastQAStarted"]
# Ensure sorting by lastQAStarted works as expected - current floated to top
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=lastQAStarted",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == crawler_crawl_id
assert crawls[0]["activeQAStats"]
assert crawls[0]["lastQAState"]
assert crawls[0]["lastQAStarted"]
# Ensure sorting by lastQAStarted works as expected with all-crawls
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAStarted",
headers=crawler_auth_headers,
)
assert r.status_code == 200
crawls = r.json()["items"]
assert crawls[0]["id"] == crawler_crawl_id
assert crawls[0]["activeQAStats"]
assert crawls[0]["lastQAState"]
assert crawls[0]["lastQAStarted"]
# Cancel crawl
r = requests.post(

View File

@ -419,9 +419,22 @@ def test_list_all_crawls(
assert item["finished"]
assert item["state"]
# Test that all-crawls qaState sort always puts crawls before uploads
# Test that all-crawls lastQAState and lastQAStarted sorts always put crawls before uploads
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=qaState",
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAState",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
last_type = None
for item in data["items"]:
if last_type == "upload":
assert item["type"] != "crawl"
last_type = item["type"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAStarted",
headers=admin_auth_headers,
)
assert r.status_code == 200