More friendly WARC prefix inside WACZ based on Org slug + Crawl Name / First Seed URL. (#1537)

Supports setting the WARC prefix for WARCs inside a WACZ to `<org slug>-<slug
[crawl name | first seed host]>`.
- Prefix is set via the WARC_PREFIX env var, supported in browsertrix-crawler
1.0.0-beta.4 or higher.
- If a crawl name is provided, it is used; otherwise, the hostname of the first
seed is used. The name is slugified into lowercase alphanumeric characters
separated by dashes.

Ex: in an organization called `Default Org`, a crawl of
`https://specs.webrecorder.net/` with no name will have WARCs named
`default-org-specs-webrecorder-net-....warc.gz`.
If the crawl is given the name `SPECS`, the WARCs will be named
`default-org-specs-manual-....warc.gz`.
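
To illustrate the slugification, here is a minimal hypothetical sketch. The real `slug_from_name` lives in `btrixcloud.utils` and is not shown in this diff; the regex below is an assumption based on the "lowercase alphanumeric characters separated by dashes" description:

    import re
    from urllib.parse import urlsplit

    def slug_from_name(name: str) -> str:
        # Assumed behavior: lowercase alphanumeric runs separated by dashes.
        return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")

    org_slug = "default-org"

    # No crawl name: fall back to the hostname of the first seed.
    host = urlsplit("https://specs.webrecorder.net/").netloc
    print(org_slug + "-" + slug_from_name(host))     # default-org-specs-webrecorder-net

    # Crawl name provided: the name wins; the seed URL is ignored.
    print(org_slug + "-" + slug_from_name("SPECS"))  # default-org-specs

Per the examples above, the crawler appends the rest of the filename (e.g. `-manual-...`) after this prefix.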

Fixes #412 by providing a sensible default.
Authored by Ilya Kreymer on 2024-02-22 14:00:19 -08:00; committed by Ilya Kreymer
parent ba18abc063
commit 8ae032ff88
6 changed files with 64 additions and 22 deletions

View File

@@ -33,7 +33,7 @@ from .models import (
     CrawlerChannel,
     CrawlerChannels,
 )
-from .utils import dt_now
+from .utils import dt_now, slug_from_name
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -232,6 +232,7 @@ class CrawlConfigOps:
             run_now=run_now,
             out_filename=out_filename,
             profile_filename=profile_filename or "",
+            warc_prefix=self.get_warc_prefix(org, crawlconfig),
         )
 
         if crawl_id and run_now:
@@ -298,6 +299,7 @@ class CrawlConfigOps:
             run_now=False,
             out_filename=self.default_filename_template,
             profile_filename=profile_filename or "",
+            warc_prefix=self.get_warc_prefix(org, crawlconfig),
         )
 
     async def update_crawl_config(
@@ -841,7 +843,10 @@ class CrawlConfigOps:
         try:
             crawl_id = await self.crawl_manager.create_crawl_job(
-                crawlconfig, org.storage, userid=str(user.id)
+                crawlconfig,
+                org.storage,
+                userid=str(user.id),
+                warc_prefix=self.get_warc_prefix(org, crawlconfig),
             )
             await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
             return crawl_id
@@ -897,6 +902,21 @@ class CrawlConfigOps:
         """Get crawler image name by id"""
         return self.crawler_images_map.get(crawler_channel or "")
 
+    def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
+        """Generate WARC prefix slug from org slug, name or url
+        if no name is provided, hostname is used from url, otherwise
+        url is ignored"""
+        name = crawlconfig.name
+        if not name:
+            if crawlconfig.config.seeds and len(crawlconfig.config.seeds):
+                url = crawlconfig.config.seeds[0].url
+                parts = urllib.parse.urlsplit(url)
+                name = parts.netloc
+
+        name = slug_from_name(name or "")
+        prefix = org.slug + "-" + name
+        return prefix[:80]
+
 
 # ============================================================================
 # pylint: disable=too-many-locals

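As a quick sanity check of `get_warc_prefix` above, a hypothetical usage sketch with stand-in objects (the field names mirror what the method reads; the values are invented for illustration):

    from types import SimpleNamespace

    org = SimpleNamespace(slug="default-org")
    seed = SimpleNamespace(url="https://specs.webrecorder.net/")
    cfg = SimpleNamespace(name="", config=SimpleNamespace(seeds=[seed]))

    # No name: the hostname of the first seed is slugified.
    #   get_warc_prefix(org, cfg) -> "default-org-specs-webrecorder-net"
    # With cfg.name = "My Crawl!", the seed URL is ignored:
    #   get_warc_prefix(org, cfg) -> "default-org-my-crawl"
    # Either way, the result is truncated to 80 characters (prefix[:80]).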
View File

@@ -119,6 +119,7 @@ class CrawlManager(K8sAPI):
         run_now: bool,
         out_filename: str,
         profile_filename: str,
+        warc_prefix: str,
     ) -> Optional[str]:
         """add new crawl, store crawl config in configmap"""
@@ -139,7 +140,10 @@ class CrawlManager(K8sAPI):
         if run_now:
             crawl_id = await self.create_crawl_job(
-                crawlconfig, storage, str(crawlconfig.modifiedBy)
+                crawlconfig,
+                storage,
+                str(crawlconfig.modifiedBy),
+                warc_prefix,
             )
 
         await self._update_scheduled_job(crawlconfig)
@@ -151,6 +155,7 @@
         crawlconfig: CrawlConfig,
         storage: StorageRef,
         userid: str,
+        warc_prefix: str,
     ) -> str:
         """create new crawl job from config"""
         cid = str(crawlconfig.id)
@@ -169,6 +174,7 @@
             crawlconfig.crawlTimeout,
             crawlconfig.maxCrawlSize,
             manual=True,
+            warc_prefix=warc_prefix,
         )
 
     async def update_crawl_config(

View File

@@ -86,6 +86,7 @@ class K8sAPI:
         max_crawl_size=0,
         manual=True,
         crawl_id=None,
+        warc_prefix="",
     ):
         """load job template from yaml"""
         if not crawl_id:
@@ -104,6 +105,7 @@ class K8sAPI:
             "storage_name": str(storage),
             "manual": "1" if manual else "0",
             "crawler_channel": crawler_channel,
+            "warc_prefix": warc_prefix,
         }
 
         data = self.templates.env.get_template("crawl_job.yaml").render(params)

View File

@@ -538,6 +538,8 @@ class BtrixOperator(K8sAPI):
         params["storage_filename"] = configmap["STORE_FILENAME"]
         params["restart_time"] = spec.get("restartTime")
 
+        params["warc_prefix"] = spec.get("warcPrefix")
+
         params["redis_url"] = redis_url
 
         if spec.get("restartTime") != status.restartTime:
@@ -1651,26 +1653,10 @@
         org = await self.org_ops.get_org_by_id(UUID(oid))
 
-        crawl_id, crawljob = self.new_crawl_job_yaml(
-            cid,
-            userid=userid,
-            oid=oid,
-            storage=org.storage,
-            crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
-            scale=int(configmap.get("INITIAL_SCALE", 1)),
-            crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
-            max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
-            manual=False,
-            crawl_id=crawl_id,
-        )
-
-        attachments = list(yaml.safe_load_all(crawljob))
-
-        if crawl_id in crawljobs:
-            attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]
+        warc_prefix = None
 
         if not actual_state:
             # pylint: disable=duplicate-code
             # cronjob doesn't exist yet
             crawlconfig = await self.crawl_config_ops.get_crawl_config(
                 UUID(cid), UUID(oid)
             )
@@ -1686,11 +1672,35 @@
                 print(f"error: missing user for id {userid}")
                 return {"attachments": []}
 
+            warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)
+
             await self.crawl_config_ops.add_new_crawl(
-                crawl_id, crawlconfig, user, manual=False
+                crawl_id,
+                crawlconfig,
+                user,
+                manual=False,
             )
             print("Scheduled Crawl Created: " + crawl_id)
 
+        crawl_id, crawljob = self.new_crawl_job_yaml(
+            cid,
+            userid=userid,
+            oid=oid,
+            storage=org.storage,
+            crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
+            scale=int(configmap.get("INITIAL_SCALE", 1)),
+            crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
+            max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
+            manual=False,
+            crawl_id=crawl_id,
+            warc_prefix=warc_prefix,
+        )
+
+        attachments = list(yaml.safe_load_all(crawljob))
+
+        if crawl_id in crawljobs:
+            attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]
+
         return {
             "attachments": attachments,
         }

View File

@@ -24,6 +24,7 @@ spec:
   manual: {{ manual }}
   crawlerChannel: "{{ crawler_channel }}"
   ttlSecondsAfterFinished: 30
+  warcPrefix: "{{ warc_prefix }}"
 
   storageName: "{{ storage_name }}"

View File

@@ -149,6 +149,9 @@ spec:
     - name: STORE_USER
       value: "{{ userid }}"
 
+    - name: WARC_PREFIX
+      value: "{{ warc_prefix }}"
+
     {% if crawler_socks_proxy_host %}
     - name: SOCKS_HOST
       value: "{{ crawler_socks_proxy_host }}"
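
With the `Default Org` / `https://specs.webrecorder.net/` example from the commit message, the rendered crawler spec would contain an env entry along these lines (hypothetical rendering of the template above):

    - name: WARC_PREFIX
      value: "default-org-specs-webrecorder-net"

browsertrix-crawler 1.0.0-beta.4 or higher reads WARC_PREFIX and uses it as the filename prefix for the WARCs it writes into the WACZ.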