Add collection page list/search endpoint (#2354)
Fixes #2353. Adds a new endpoint to list pages in a collection, with filtering on `url` (exact match), `urlPrefix`, `ts`, `isSeed`, and `depth`, plus additional sort options and accompanying tests. The same filters and sort options have also been added to the crawl pages endpoint. Also fixes an issue where `isSeed` was not being set in the database when false but only added on serialization, which prevented filtering on it from working as expected.
parent 001839a521
commit 98a45b0d85
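A minimal usage sketch of the new collection pages endpoint, assuming a running deployment; the base URL, org/collection IDs, and token below are placeholders, and the query parameters mirror the filters and sort options described above:

import requests

API_PREFIX = "https://app.example.com/api"      # placeholder deployment URL
org_id = "<org-uuid>"                           # placeholder
coll_id = "<collection-uuid>"                   # placeholder
headers = {"Authorization": "Bearer <token>"}   # placeholder auth header

# List pages in a collection, filtered to seed pages at depth 0,
# sorted by URL ascending, first page of 25 results.
r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/collections/{coll_id}/pages",
    params={
        "isSeed": "true",
        "depth": 0,
        "sortBy": "url",
        "sortDirection": 1,
        "pageSize": 25,
        "page": 1,
    },
    headers=headers,
)
r.raise_for_status()
data = r.json()
print(data["total"], [p["url"] for p in data["items"]])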
@@ -248,7 +248,14 @@ def main() -> None:
     upload_ops = init_uploads_api(*base_crawl_init)
 
     page_ops = init_pages_api(
-        app, mdb, crawls, org_ops, storage_ops, background_job_ops, current_active_user
+        app,
+        mdb,
+        crawls,
+        org_ops,
+        storage_ops,
+        background_job_ops,
+        coll_ops,
+        current_active_user,
     )
 
     base_crawl_ops.set_page_ops(page_ops)
@@ -89,7 +89,9 @@ def init_ops() -> Tuple[
 
     upload_ops = UploadOps(*base_crawl_init)
 
-    page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
+    page_ops = PageOps(
+        mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
+    )
 
     base_crawl_ops.set_page_ops(page_ops)
     crawl_ops.set_page_ops(page_ops)
@@ -1,8 +1,12 @@
 """crawl pages"""
 
+# pylint: disable=too-many-lines
+
 import asyncio
 import os
+import re
 import traceback
+import urllib.parse
 from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
 from uuid import UUID, uuid4
@@ -37,11 +41,12 @@ from .utils import str_to_date, str_list_to_bools, dt_now
 
 if TYPE_CHECKING:
     from .background_jobs import BackgroundJobOps
+    from .colls import CollectionOps
     from .crawls import CrawlOps
     from .orgs import OrgOps
     from .storages import StorageOps
 else:
-    CrawlOps = StorageOps = OrgOps = BackgroundJobOps = object
+    CrawlOps = StorageOps = OrgOps = BackgroundJobOps = CollectionOps = object
 
 
 # ============================================================================
@@ -53,14 +58,18 @@ class PageOps:
     org_ops: OrgOps
     storage_ops: StorageOps
     background_job_ops: BackgroundJobOps
+    coll_ops: CollectionOps
 
-    def __init__(self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops):
+    def __init__(
+        self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
+    ):
        self.pages = mdb["pages"]
         self.crawls = mdb["crawls"]
         self.crawl_ops = crawl_ops
         self.org_ops = org_ops
         self.storage_ops = storage_ops
         self.background_job_ops = background_job_ops
+        self.coll_ops = coll_ops
 
     async def init_index(self):
         """init index for pages db collection"""
@@ -82,6 +91,9 @@ class PageOps:
             if not page_dict.get("url"):
                 continue
 
+            if not page_dict.get("isSeed"):
+                page_dict["isSeed"] = False
+
             if len(pages_buffer) > batch_size:
                 await self._add_pages_to_db(crawl_id, pages_buffer)
                 pages_buffer = []
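The hunk above is the fix called out in the description: `isSeed` was previously defaulted to `False` only at serialization time, so stored documents could be missing the field entirely and an equality filter on `isSeed=false` would skip them. A minimal, driver-independent illustration of that mismatch (the page dicts below are hypothetical records, and the match function is a simplified model of equality matching on a non-null value):

# Two hypothetical stored page records: one with isSeed persisted, one without.
page_with_field = {"url": "https://example.com/about", "isSeed": False}
page_without_field = {"url": "https://example.com/"}

def matches_is_seed(doc: dict, value: bool) -> bool:
    # Simplified model: a document with no isSeed key is not equal to False,
    # so a {"isSeed": False} filter never returns it.
    return "isSeed" in doc and doc["isSeed"] == value

print(matches_is_seed(page_with_field, False))     # True  - filter finds it
print(matches_is_seed(page_without_field, False))  # False - silently excluded before the fix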
@@ -210,9 +222,8 @@ class PageOps:
     ):
         """Add page to database"""
         page = self._get_page_from_dict(page_dict, crawl_id, oid)
-        page_to_insert = page.to_dict(
-            exclude_unset=True, exclude_none=True, exclude_defaults=True
-        )
+
+        page_to_insert = page.to_dict(exclude_unset=True, exclude_none=True)
 
         try:
             await self.pages.insert_one(page_to_insert)
@@ -492,6 +503,11 @@ class PageOps:
         self,
         crawl_id: str,
         org: Optional[Organization] = None,
+        url: Optional[str] = None,
+        url_prefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        is_seed: Optional[bool] = None,
+        depth: Optional[int] = None,
         qa_run_id: Optional[str] = None,
         qa_filter_by: Optional[str] = None,
         qa_gte: Optional[float] = None,
@@ -518,6 +534,23 @@ class PageOps:
         if org:
             query["oid"] = org.id
 
+        if url_prefix:
+            url_prefix = urllib.parse.unquote(url_prefix)
+            regex_pattern = f"^{re.escape(url_prefix)}"
+            query["url"] = {"$regex": regex_pattern, "$options": "i"}
+
+        elif url:
+            query["url"] = urllib.parse.unquote(url)
+
+        if ts:
+            query["ts"] = ts
+
+        if is_seed in (True, False):
+            query["isSeed"] = is_seed
+
+        if isinstance(depth, int):
+            query["depth"] = depth
+
         if reviewed:
             query["$or"] = [
                 {"approved": {"$ne": None}},
@@ -562,7 +595,18 @@ class PageOps:
             # Sorting options to add:
             # - automated heuristics like screenshot_comparison (dict keyed by QA run id)
             # - Ensure notes sorting works okay with notes in list
-            sort_fields = ("url", "title", "notes", "approved")
+            sort_fields = (
+                "url",
+                "title",
+                "notes",
+                "approved",
+                "ts",
+                "status",
+                "mime",
+                "filename",
+                "depth",
+                "isSeed",
+            )
             qa_sort_fields = ("screenshotMatch", "textMatch")
             if sort_by not in sort_fields and sort_by not in qa_sort_fields:
                 raise HTTPException(status_code=400, detail="invalid_sort_by")
@@ -613,6 +657,101 @@ class PageOps:
 
         return [PageOut.from_dict(data) for data in items], total
 
+    async def list_collection_pages(
+        self,
+        coll_id: UUID,
+        org: Optional[Organization] = None,
+        url: Optional[str] = None,
+        url_prefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        is_seed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        page_size: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        sort_by: Optional[str] = None,
+        sort_direction: Optional[int] = -1,
+    ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
+        """List all pages in collection, with optional filtering"""
+        # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
+        # Zero-index page for query
+        page = page - 1
+        skip = page_size * page
+
+        crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)
+
+        query: dict[str, object] = {
+            "crawl_id": {"$in": crawl_ids},
+        }
+        if org:
+            query["oid"] = org.id
+
+        if url_prefix:
+            url_prefix = urllib.parse.unquote(url_prefix)
+            regex_pattern = f"^{re.escape(url_prefix)}"
+            query["url"] = {"$regex": regex_pattern, "$options": "i"}
+
+        elif url:
+            query["url"] = urllib.parse.unquote(url)
+
+        if ts:
+            query["ts"] = ts
+
+        if is_seed in (True, False):
+            query["isSeed"] = is_seed
+
+        if isinstance(depth, int):
+            query["depth"] = depth
+
+        aggregate = [{"$match": query}]
+
+        if sort_by:
+            # Sorting options to add:
+            # - automated heuristics like screenshot_comparison (dict keyed by QA run id)
+            # - Ensure notes sorting works okay with notes in list
+            sort_fields = (
+                "url",
+                "crawl_id",
+                "ts",
+                "status",
+                "mime",
+                "filename",
+                "depth",
+                "isSeed",
+            )
+            if sort_by not in sort_fields:
+                raise HTTPException(status_code=400, detail="invalid_sort_by")
+            if sort_direction not in (1, -1):
+                raise HTTPException(status_code=400, detail="invalid_sort_direction")
+
+            aggregate.extend([{"$sort": {sort_by: sort_direction}}])
+
+        aggregate.extend(
+            [
+                {
+                    "$facet": {
+                        "items": [
+                            {"$skip": skip},
+                            {"$limit": page_size},
+                        ],
+                        "total": [{"$count": "count"}],
+                    }
+                },
+            ]
+        )
+
+        # Get total
+        cursor = self.pages.aggregate(aggregate)
+        results = await cursor.to_list(length=1)
+        result = results[0]
+        items = result["items"]
+
+        try:
+            total = int(result["total"][0]["count"])
+        except (IndexError, ValueError):
+            total = 0
+
+        return [PageOut.from_dict(data) for data in items], total
+
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
         await self.delete_crawl_pages(crawl_id, oid)
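For orientation, the new `list_collection_pages` method follows the same aggregation shape as the existing `list_pages`: a `$match` over the collection's crawl IDs plus any filters, an optional `$sort`, and a single `$facet` stage that returns one page of items and the total count in one round trip. A standalone sketch of the pipeline it builds (the filter and sort values here are made up for illustration):

# Hypothetical inputs, for illustration only.
crawl_ids = ["crawl-1", "crawl-2"]
skip, page_size = 0, 25
sort_by, sort_direction = "url", 1

pipeline = [
    {"$match": {"crawl_id": {"$in": crawl_ids}, "isSeed": True}},
    {"$sort": {sort_by: sort_direction}},
    {
        "$facet": {
            # First facet: the requested slice of matching pages.
            "items": [{"$skip": skip}, {"$limit": page_size}],
            # Second facet: the total number of matching pages.
            "total": [{"$count": "count"}],
        }
    },
]

# The endpoint then reads result["items"] and result["total"][0]["count"]
# from the single document this pipeline produces.
print(pipeline)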
@@ -738,13 +877,14 @@ class PageOps:
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
 def init_pages_api(
-    app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, user_dep
+    app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
 ):
     """init pages API"""
     # pylint: disable=invalid-name
 
-    ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
+    ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops)
 
     org_viewer_dep = org_ops.org_viewer_dep
     org_crawl_dep = org_ops.org_crawl_dep
 
     @app.post(
@@ -913,9 +1053,14 @@ def init_pages_api(
         tags=["pages", "all-crawls"],
         response_model=PaginatedPageOutResponse,
     )
-    async def get_pages_list(
+    async def get_crawl_pages_list(
         crawl_id: str,
         org: Organization = Depends(org_crawl_dep),
+        url: Optional[str] = None,
+        urlPrefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
         reviewed: Optional[bool] = None,
         approved: Optional[str] = None,
         hasNotes: Optional[bool] = None,
@@ -932,6 +1077,11 @@ def init_pages_api(
         pages, total = await ops.list_pages(
             crawl_id=crawl_id,
             org=org,
+            url=url,
+            url_prefix=urlPrefix,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
             reviewed=reviewed,
             approved=formatted_approved,
             has_notes=hasNotes,
@@ -942,6 +1092,40 @@ def init_pages_api(
         )
         return paginated_format(pages, total, page, pageSize)
 
+    @app.get(
+        "/orgs/{oid}/collections/{coll_id}/pages",
+        tags=["pages", "collections"],
+        response_model=PaginatedPageOutResponse,
+    )
+    async def get_collection_pages_list(
+        coll_id: UUID,
+        org: Organization = Depends(org_viewer_dep),
+        url: Optional[str] = None,
+        urlPrefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        sortBy: Optional[str] = None,
+        sortDirection: Optional[int] = -1,
+    ):
+        """Retrieve paginated list of pages in collection"""
+        pages, total = await ops.list_collection_pages(
+            coll_id=coll_id,
+            org=org,
+            url=url,
+            url_prefix=urlPrefix,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            page_size=pageSize,
+            page=page,
+            sort_by=sortBy,
+            sort_direction=sortDirection,
+        )
+        return paginated_format(pages, total, page, pageSize)
+
     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/pages",
         tags=["pages", "qa"],
@@ -232,7 +232,7 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id):
         "name": "Crawler User Test Crawl",
         "description": "crawler test crawl",
         "tags": ["wr-test-2"],
-        "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 1},
+        "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 3},
         "crawlerChannel": "test",
     }
     r = requests.post(
@@ -582,6 +582,121 @@ def test_list_collections(
     assert second_coll["dateLatest"]
 
 
+def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
+    # Test list endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 0
+
+    pages = data["items"]
+    assert pages
+
+    for page in pages:
+        assert page["id"]
+        assert page["oid"]
+        assert page["crawl_id"]
+        assert page["url"]
+        assert page["ts"]
+        assert page.get("title") or page.get("title") is None
+        assert page.get("loadState") or page.get("loadState") is None
+        assert page.get("status") or page.get("status") is None
+        assert page.get("mime") or page.get("mime") is None
+        assert page["isError"] in (None, True, False)
+        assert page["isFile"] in (None, True, False)
+
+    # Save info for page to test url and urlPrefix filters
+    coll_page = pages[0]
+    coll_page_id = coll_page["id"]
+    coll_page_url = coll_page["url"]
+    coll_page_ts = coll_page["ts"]
+
+    # Test exact url filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert matching_page["url"] == coll_page_url
+
+    # Test exact url and ts filters together
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?url={coll_page_url}&ts={coll_page_ts}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert matching_page["url"] == coll_page_url
+        assert matching_page["ts"] == coll_page_ts
+
+    # Test urlPrefix filter
+    url_prefix = coll_page_url[:8]
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?urlPrefix={url_prefix}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+
+    found_matching_page = False
+    for page in data["items"]:
+        if page["id"] == coll_page_id and page["url"] == coll_page_url:
+            found_matching_page = True
+
+    assert found_matching_page
+
+    # Test isSeed filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=true",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    for page in data["items"]:
+        assert page["isSeed"]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?isSeed=false",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    for page in data["items"]:
+        assert page["isSeed"] is False
+
+    # Test depth filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=0",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    for page in data["items"]:
+        assert page["depth"] == 0
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?depth=1",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    for page in data["items"]:
+        assert page["depth"] == 1
+
+
 def test_remove_upload_from_collection(crawler_auth_headers, default_org_id):
     # Remove upload
     r = requests.post(
@@ -658,7 +658,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["total"] >= 0
+
+    assert data["total"] == 3
 
     pages = data["items"]
     assert pages
@@ -682,7 +683,11 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
 
     # Test GET page endpoint
     global page_id
-    page_id = pages[0]["id"]
+    test_page = pages[0]
+    page_id = test_page["id"]
+    test_page_url = test_page["url"]
+    test_page_ts = test_page["ts"]
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
         headers=crawler_auth_headers,
@@ -710,13 +715,100 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("modified") is None
     assert page.get("approved") is None
 
+    # Test exact url filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert matching_page["url"] == test_page_url
+
+    # Test exact url and ts filters together
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?url={test_page_url}&ts={test_page_ts}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert matching_page["url"] == test_page_url
+        assert matching_page["ts"] == test_page_ts
+
+    # Test urlPrefix filter
+    url_prefix = test_page_url[:8]
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?urlPrefix={url_prefix}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+
+    found_matching_page = False
+    for page in data["items"]:
+        if page["id"] == page_id and page["url"] == test_page_url:
+            found_matching_page = True
+
+    assert found_matching_page
+
+    # Test isSeed filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?isSeed=True",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] == 1
+    for page in data["items"]:
+        assert page["isSeed"]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?isSeed=False",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] == 2
+    for page in data["items"]:
+        assert page["isSeed"] is False
+
+    # Test depth filter
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?depth=0",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] == 1
+    for page in data["items"]:
+        assert page["depth"] == 0
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?depth=1",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] == 2
+    for page in data["items"]:
+        assert page["depth"] == 1
+
+
+def test_crawl_pages_qa_filters(crawler_auth_headers, default_org_id, crawler_crawl_id):
     # Test reviewed filter (page has no notes or approved so should show up in false)
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 1
+    assert r.json()["total"] == 3
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
@@ -770,15 +862,15 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 0
+    assert r.json()["total"] == 2
 
-    # Test reviewed filter (page now approved so should show up in True)
+    # Test reviewed filter (page now approved so should show up in True, other pages show here)
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=False",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 0
+    assert r.json()["total"] == 2
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
@@ -853,7 +945,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 0
+    assert r.json()["total"] == 2
 
 
 def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
@@ -985,14 +1077,14 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 1
+    assert r.json()["total"] == 3
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?approved=true,false,none",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 1
+    assert r.json()["total"] == 3
 
     # Test reviewed filter (page now has notes so should show up in True)
     r = requests.get(
@@ -1000,7 +1092,7 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id
         headers=crawler_auth_headers,
    )
     assert r.status_code == 200
-    assert r.json()["total"] == 0
+    assert r.json()["total"] == 2
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?reviewed=True",
@@ -1015,7 +1107,7 @@ def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert r.json()["total"] == 0
+    assert r.json()["total"] == 2
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages?hasNotes=True",