diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 7e41ddb9..9690ebe5 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -33,7 +33,7 @@ from .models import ( CrawlerChannel, CrawlerChannels, ) -from .utils import dt_now +from .utils import dt_now, slug_from_name if TYPE_CHECKING: from .orgs import OrgOps @@ -232,6 +232,7 @@ class CrawlConfigOps: run_now=run_now, out_filename=out_filename, profile_filename=profile_filename or "", + warc_prefix=self.get_warc_prefix(org, crawlconfig), ) if crawl_id and run_now: @@ -298,6 +299,7 @@ class CrawlConfigOps: run_now=False, out_filename=self.default_filename_template, profile_filename=profile_filename or "", + warc_prefix=self.get_warc_prefix(org, crawlconfig), ) async def update_crawl_config( @@ -841,7 +843,10 @@ class CrawlConfigOps: try: crawl_id = await self.crawl_manager.create_crawl_job( - crawlconfig, org.storage, userid=str(user.id) + crawlconfig, + org.storage, + userid=str(user.id), + warc_prefix=self.get_warc_prefix(org, crawlconfig), ) await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True) return crawl_id @@ -897,6 +902,21 @@ class CrawlConfigOps: """Get crawler image name by id""" return self.crawler_images_map.get(crawler_channel or "") + def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str: + """Generate WARC prefix slug from org slug, name or url + if no name is provided, hostname is used from url, otherwise + url is ignored""" + name = crawlconfig.name + if not name: + if crawlconfig.config.seeds and len(crawlconfig.config.seeds): + url = crawlconfig.config.seeds[0].url + parts = urllib.parse.urlsplit(url) + name = parts.netloc + + name = slug_from_name(name or "") + prefix = org.slug + "-" + name + return prefix[:80] + # ============================================================================ # pylint: disable=too-many-locals diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 90de4201..a8fcfd83 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -119,6 +119,7 @@ class CrawlManager(K8sAPI): run_now: bool, out_filename: str, profile_filename: str, + warc_prefix: str, ) -> Optional[str]: """add new crawl, store crawl config in configmap""" @@ -139,7 +140,10 @@ class CrawlManager(K8sAPI): if run_now: crawl_id = await self.create_crawl_job( - crawlconfig, storage, str(crawlconfig.modifiedBy) + crawlconfig, + storage, + str(crawlconfig.modifiedBy), + warc_prefix, ) await self._update_scheduled_job(crawlconfig) @@ -151,6 +155,7 @@ class CrawlManager(K8sAPI): crawlconfig: CrawlConfig, storage: StorageRef, userid: str, + warc_prefix: str, ) -> str: """create new crawl job from config""" cid = str(crawlconfig.id) @@ -169,6 +174,7 @@ class CrawlManager(K8sAPI): crawlconfig.crawlTimeout, crawlconfig.maxCrawlSize, manual=True, + warc_prefix=warc_prefix, ) async def update_crawl_config( diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index 766756b8..73663934 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -86,6 +86,7 @@ class K8sAPI: max_crawl_size=0, manual=True, crawl_id=None, + warc_prefix="", ): """load job template from yaml""" if not crawl_id: @@ -104,6 +105,7 @@ class K8sAPI: "storage_name": str(storage), "manual": "1" if manual else "0", "crawler_channel": crawler_channel, + "warc_prefix": warc_prefix, } data = self.templates.env.get_template("crawl_job.yaml").render(params) diff --git a/backend/btrixcloud/operator.py b/backend/btrixcloud/operator.py index 382b2d5d..e9109e48 100644 --- a/backend/btrixcloud/operator.py +++ b/backend/btrixcloud/operator.py @@ -538,6 +538,8 @@ class BtrixOperator(K8sAPI): params["storage_filename"] = configmap["STORE_FILENAME"] params["restart_time"] = spec.get("restartTime") + params["warc_prefix"] = spec.get("warcPrefix") + params["redis_url"] = redis_url if spec.get("restartTime") != status.restartTime: @@ -1651,26 +1653,10 @@ class BtrixOperator(K8sAPI): org = await self.org_ops.get_org_by_id(UUID(oid)) - crawl_id, crawljob = self.new_crawl_job_yaml( - cid, - userid=userid, - oid=oid, - storage=org.storage, - crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"), - scale=int(configmap.get("INITIAL_SCALE", 1)), - crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)), - max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")), - manual=False, - crawl_id=crawl_id, - ) - - attachments = list(yaml.safe_load_all(crawljob)) - - if crawl_id in crawljobs: - attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"] + warc_prefix = None if not actual_state: - # pylint: disable=duplicate-code + # cronjob doesn't exist yet crawlconfig = await self.crawl_config_ops.get_crawl_config( UUID(cid), UUID(oid) ) @@ -1686,11 +1672,35 @@ class BtrixOperator(K8sAPI): print(f"error: missing user for id {userid}") return {"attachments": []} + warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig) + await self.crawl_config_ops.add_new_crawl( - crawl_id, crawlconfig, user, manual=False + crawl_id, + crawlconfig, + user, + manual=False, ) print("Scheduled Crawl Created: " + crawl_id) + crawl_id, crawljob = self.new_crawl_job_yaml( + cid, + userid=userid, + oid=oid, + storage=org.storage, + crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"), + scale=int(configmap.get("INITIAL_SCALE", 1)), + crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)), + max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")), + manual=False, + crawl_id=crawl_id, + warc_prefix=warc_prefix, + ) + + attachments = list(yaml.safe_load_all(crawljob)) + + if crawl_id in crawljobs: + attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"] + return { "attachments": attachments, } diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml index 84fad5ef..96fec0fc 100644 --- a/chart/app-templates/crawl_job.yaml +++ b/chart/app-templates/crawl_job.yaml @@ -24,6 +24,7 @@ spec: manual: {{ manual }} crawlerChannel: "{{ crawler_channel }}" ttlSecondsAfterFinished: 30 + warcPrefix: "{{ warc_prefix }}" storageName: "{{ storage_name }}" diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index 67d8c588..b3e7a0c9 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -149,6 +149,9 @@ spec: - name: STORE_USER value: "{{ userid }}" + - name: WARC_PREFIX + value: "{{ warc_prefix }}" + {% if crawler_socks_proxy_host %} - name: SOCKS_HOST value: "{{ crawler_socks_proxy_host }}"