browsertrix/backend/btrixcloud/main_scheduled_job.py
Ilya Kreymer 2da6c1c905
1.6.3 Fixes - Fix workflow sort order for Latest Crawl + 'Remove From Collection' action menu on archived items in collections (#1113)
* fix latest crawl (lastRun) sort:
- don't cast 'started' value to string when setting it as the starting crawl time (regression from #937)
- this caused incorrect sorting, since the finished crawl time was a datetime while the starting crawl time was a string (see the sketch after this list)
- move updating of config crawl info into one place; simplify to avoid returning the started time altogether, just set it directly
- pass the mdb crawlconfigs and crawls collections directly to the add_new_crawl() function
- fixes #1108
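
To make the type mismatch concrete, a minimal Python sketch (not Browsertrix code, timestamps are hypothetical) of why a string 'started' value can't be ordered against datetime values; MongoDB likewise sorts strings and dates as separate type groups, which is what broke the lastRun order:

    from datetime import datetime

    finished = datetime(2023, 8, 24, 12, 0)
    started_ok = datetime(2023, 8, 25, 21, 0)        # kept as datetime (the fix)
    started_bad = str(datetime(2023, 8, 25, 21, 0))  # cast to string (the regression)

    # datetimes compare cleanly against each other
    print(sorted([started_ok, finished]))

    # a string can't be ordered against a datetime at all
    try:
        sorted([started_bad, finished])
    except TypeError as err:
        print("mixed types:", err)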

* Add dropdown menu containing 'Remove from Collection' to archived items in collection view (#1110)
- Enables users to remove an item from a collection directly from the collection detail view; the menu was previously missing
- Fixes: #1102 (missing dropdown menu) by making use of the inactive menu trigger button
- Updates the collection items page size to match the "Archived Items" page size (20 items per page)

---------
Co-authored-by: sua yoo <sua@webrecorder.org>
2023-08-25 21:08:47 -07:00


""" entrypoint for cron crawl job"""
import asyncio
import os
import uuid
from .k8sapi import K8sAPI
from .db import init_db
from .crawlconfigs import (
get_crawl_config,
inc_crawl_count,
)
from .crawls import add_new_crawl
from .utils import register_exit_handler
# ============================================================================
class ScheduledJob(K8sAPI):
"""Schedulued Job APIs for starting CrawlJobs on schedule"""
def __init__(self):
super().__init__()
self.cid = os.environ["CID"]
_, mdb = init_db()
self.crawls = mdb["crawls"]
self.crawlconfigs = mdb["crawl_configs"]
async def run(self):
"""run crawl!"""
register_exit_handler()
config_map = await self.core_api.read_namespaced_config_map(
name=f"crawl-config-{self.cid}", namespace=self.namespace
)
data = config_map.data
userid = data["USER_ID"]
scale = int(data.get("INITIAL_SCALE", 0))
try:
crawl_timeout = int(data.get("CRAWL_TIMEOUT", 0))
# pylint: disable=bare-except
except:
crawl_timeout = 0
oid = data["ORG_ID"]
crawlconfig = await get_crawl_config(self.crawlconfigs, uuid.UUID(self.cid))
# k8s create
crawl_id = await self.new_crawl_job(
self.cid, userid, oid, scale, crawl_timeout, manual=False
)
# db create
await inc_crawl_count(self.crawlconfigs, crawlconfig.id)
await add_new_crawl(
self.crawls,
self.crawlconfigs,
crawl_id,
crawlconfig,
uuid.UUID(userid),
manual=False,
)
print("Crawl Created: " + crawl_id)
# ============================================================================
def main():
"""main entrypoint"""
job = ScheduledJob()
loop = asyncio.get_event_loop()
loop.run_until_complete(job.run())
if __name__ == "__main__":
main()
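
For reference, a sketch of the inputs run() consumes; the values below are hypothetical, and in a deployment the CID env var and the crawl-config-{cid} ConfigMap come from the cluster rather than being set by hand:

    import os

    os.environ["CID"] = "7f9d0a3c-..."  # hypothetical workflow (crawlconfig) id

    # keys read from the crawl-config-{cid} ConfigMap above
    example_config_map_data = {
        "USER_ID": "...",         # crawl is recorded under this user
        "ORG_ID": "...",          # org that owns the workflow
        "INITIAL_SCALE": "1",     # parsed with int(), defaults to 0
        "CRAWL_TIMEOUT": "3600",  # parsed with int(), falls back to 0 on error
    }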