Fixes #2406

Converts migration 0042 to launch a background job (parallelized across several pods) that migrates all crawls by optimizing their pages and setting `version: 2` on each crawl when complete. Also optimizes MongoDB queries for better performance.

Migration Improvements:

- Add `isMigrating` and `version` fields to `BaseCrawl`
- Add a new background job type for the migration, with an accompanying `migration_job.yaml` template that allows for parallelization (see the work-claiming sketch below)
- Add a new API endpoint to launch this crawl migration job, and ensure that the list and retry endpoints for superusers work with background jobs that aren't tied to a specific org
- Rework background job models and methods now that not all background jobs are tied to a single org
- Ensure new crawls and uploads have `version` set to `2`
- Modify crawl and collection replay.json endpoints to only include the replay-optimization fields (`initialPages`, `pageQueryUrl`, `preloadResources`) if all relevant crawls/uploads have `version` set to `2` (sketched below)
- Remove `distinct` calls from migration pathways
- Consolidate collection recompute stats

Query Optimizations:

- Remove all uses of `$group` and `$facet`
- Optimize /replay.json endpoints to precompute `preload_resources` and avoid fetching the crawl list twice
- Optimize /collections endpoint by not fetching resources
- Rename /urls -> /pageUrlCounts and avoid `$group`; instead sort with an index, either by seed + ts or by url, to get the top matches (sketched below)
- Use `$gte` instead of `$regex` to get prefix matches on URL (sketched below)
- Use `$text` instead of `$regex` for text search on title (sketched below)
- Remove `total` from /pages and /pageUrlCounts queries by not using `$facet` (sketched below)
- frontend: only call /pageUrlCounts when the dialog is opened

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: Emma Segal-Grossman <hi@emma.cafe>
Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
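To illustrate the parallelization, here is a minimal sketch of how several migration pods could divide the work using the `isMigrating`/`version` fields this PR adds to `BaseCrawl`. The claim-by-atomic-update scheme and every name other than `isMigrating` and `version` (the collection handle, the `optimize_pages` callable) are assumptions for illustration, not the actual implementation:

```python
# Sketch only: each pod claims one unmigrated crawl at a time via an
# atomic update, so parallel pods never double-process a crawl.
from motor.motor_asyncio import AsyncIOMotorCollection


async def claim_next_crawl(crawls: AsyncIOMotorCollection):
    # find_one_and_update is atomic: only one pod can flip isMigrating
    return await crawls.find_one_and_update(
        {"version": {"$ne": 2}, "isMigrating": {"$ne": True}},
        {"$set": {"isMigrating": True}},
    )


async def migrate_crawls(crawls: AsyncIOMotorCollection, optimize_pages):
    # loop until no unmigrated, unclaimed crawls remain
    while crawl := await claim_next_crawl(crawls):
        await optimize_pages(crawl["_id"])
        # mark the crawl migrated and release the claim
        await crawls.update_one(
            {"_id": crawl["_id"]},
            {"$set": {"version": 2}, "$unset": {"isMigrating": ""}},
        )
```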
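The replay.json gating works as described above: the optimization fields are returned only once every relevant crawl/upload carries `version: 2`, so partially migrated collections keep the old replay behavior. A hedged sketch; the helper name, crawl representation, and placeholder values are assumptions:

```python
def replay_optimization_fields(crawls: list[dict]) -> dict:
    """Hypothetical helper: include the new replay fields only when all
    crawls/uploads in the response have been migrated (version == 2)."""
    if crawls and all(c.get("version") == 2 for c in crawls):
        return {
            "initialPages": [],      # precomputed first pages (placeholder)
            "pageQueryUrl": "",      # paged page-query endpoint (placeholder)
            "preloadResources": [],  # precomputed preload resources (placeholder)
        }
    # any unmigrated crawl -> omit the fields so replay falls back
    return {}
```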
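The `$gte` prefix trick from the /pageUrlCounts rework, sketched with assumed collection and field names: a prefix match becomes a range scan, so the top matches come straight off a `url` index in sorted order with no `$regex` scan and no `$group` stage:

```python
async def top_page_url_matches(pages, crawl_ids, prefix, limit=10):
    # [prefix, prefix + "\uffff") covers exactly the urls starting with
    # prefix, and both bounds are answerable from a {url: 1} index
    query = {
        "crawl_id": {"$in": crawl_ids},  # assumed field name
        "url": {"$gte": prefix, "$lt": prefix + "\uffff"},
    }
    cursor = pages.find(query, {"url": 1, "ts": 1}).sort("url", 1).limit(limit)
    return [doc async for doc in cursor]
```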
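Similarly, `$text` title search needs a one-time text index but then uses it for matching and relevance ranking, instead of an unanchored case-insensitive `$regex` that must scan every document. A sketch under the assumption of a text index on `title`:

```python
async def search_collections_by_title(colls, text, limit=20):
    # requires a text index, e.g. created once with:
    #   await colls.create_index([("title", "text")])
    cursor = (
        colls.find(
            {"$text": {"$search": text}},
            {"score": {"$meta": "textScore"}},  # project the relevance score
        )
        .sort([("score", {"$meta": "textScore"})])  # best matches first
        .limit(limit)
    )
    return [doc async for doc in cursor]
```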
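Finally, dropping `total` from /pages and /pageUrlCounts removes the `$facet` stage that previously had to materialize both a results pipeline and a count pipeline inside one aggregation. A simplified before/after sketch with assumed names; the "before" pipeline is a reconstruction, not the actual removed code:

```python
# Before (reconstructed): one aggregation, but $facet materializes both arms
# pipeline = [
#     {"$match": {"crawl_id": crawl_id}},
#     {"$facet": {
#         "items": [{"$skip": skip}, {"$limit": page_size}],
#         "total": [{"$count": "count"}],
#     }},
# ]

async def list_pages(pages, crawl_id, page=1, page_size=25):
    # After: a plain indexed find with skip/limit, and no total at all
    cursor = (
        pages.find({"crawl_id": crawl_id})
        .skip((page - 1) * page_size)
        .limit(page_size)
    )
    return [doc async for doc in cursor]
```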
"""entrypoint module for background jobs"""
|
|
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
import traceback
|
|
from uuid import UUID
|
|
|
|
from .models import BgJobType
|
|
from .ops import init_ops
|
|
|
|
|
|
job_type = os.environ.get("BG_JOB_TYPE")
|
|
oid = os.environ.get("OID")
|
|
crawl_type = os.environ.get("CRAWL_TYPE")
|
|
crawl_id = os.environ.get("CRAWL_ID")
|
|
|
|
|
|
# ============================================================================
# pylint: disable=too-many-function-args, duplicate-code, too-many-locals, too-many-return-statements
# pylint: disable=too-many-branches
async def main():
    """run background job with access to ops classes"""

    # pylint: disable=import-outside-toplevel
    if not os.environ.get("KUBERNETES_SERVICE_HOST"):
        print(
            "Sorry, the Browsertrix Backend must be run inside a Kubernetes environment.\
             Kubernetes not detected (KUBERNETES_SERVICE_HOST is not set), Exiting"
        )
        return 1

    (org_ops, _, _, _, _, page_ops, coll_ops, _, _, _, _, user_manager) = init_ops()

    # Run job (generic, not tied to a single org)
    if job_type == BgJobType.OPTIMIZE_PAGES:
        try:
            await page_ops.optimize_crawl_pages(version=2)
            return 0
        # pylint: disable=broad-exception-caught
        except Exception:
            traceback.print_exc()
            return 1

    # Run job (org-specific)
    if not oid:
        print("Org id missing, quitting")
        return 1

    org = await org_ops.get_org_by_id(UUID(oid))
    if not org:
        print("Org id invalid, quitting")
        return 1

    if job_type == BgJobType.DELETE_ORG:
        try:
            await org_ops.delete_org_and_data(org, user_manager)
            return 0
        # pylint: disable=broad-exception-caught
        except Exception:
            traceback.print_exc()
            return 1

    if job_type == BgJobType.RECALCULATE_ORG_STATS:
        try:
            await org_ops.recalculate_storage(org)
            return 0
        # pylint: disable=broad-exception-caught
        except Exception:
            traceback.print_exc()
            return 1

    if job_type == BgJobType.READD_ORG_PAGES:
        try:
            if not crawl_id:
                await page_ops.re_add_all_crawl_pages(org, crawl_type=crawl_type)
            else:
                await page_ops.re_add_crawl_pages(crawl_id=crawl_id, oid=org.id)

            await coll_ops.recalculate_org_collection_stats(org)
            return 0
        # pylint: disable=broad-exception-caught
        except Exception:
            traceback.print_exc()
            return 1

    print(f"Provided job type {job_type} not currently supported")
    return 1


# ============================================================================
if __name__ == "__main__":
    return_code = asyncio.run(main())
    sys.exit(return_code)