More friendly WARC prefix inside WACZ based on Org slug + Crawl Name / First Seed URL. (#1537)
Supports setting the WARC prefix for WARCs inside a WACZ to `<org slug>-<slug [crawl name | first seed host]>`.
- The prefix is set via the WARC_PREFIX env var, supported in browsertrix-crawler 1.0.0-beta.4 or higher.
- If a crawl name is provided, the crawl name is used; otherwise, the hostname of the first seed is used.
- The name is slugified: lowercase alphanumeric characters separated by dashes.

Ex: in an organization called `Default Org`, a crawl of `https://specs.webrecorder.net/` with no name will have WARCs named `default-org-specs-webrecorder-net-....warc.gz`. If the crawl is given the name `SPECS`, the WARCs will be named `default-org-specs-manual-....warc.gz`.

Fixes #412 in a default way.
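For reference, a minimal sketch of the slugification rule described above; the actual `slug_from_name` helper (imported from `.utils` in the diff below) may differ in detail:

import re

def slug_from_name(name: str) -> str:
    # Lowercase, collapse runs of non-alphanumeric characters into
    # single dashes, and strip any leading/trailing dashes.
    return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")

# slug_from_name("SPECS")                  -> "specs"
# slug_from_name("specs.webrecorder.net")  -> "specs-webrecorder-net"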
This commit is contained in:
parent ba18abc063
commit 8ae032ff88
@@ -33,7 +33,7 @@ from .models import (
     CrawlerChannel,
     CrawlerChannels,
 )
-from .utils import dt_now
+from .utils import dt_now, slug_from_name
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -232,6 +232,7 @@ class CrawlConfigOps:
             run_now=run_now,
             out_filename=out_filename,
             profile_filename=profile_filename or "",
+            warc_prefix=self.get_warc_prefix(org, crawlconfig),
         )
 
         if crawl_id and run_now:
@@ -298,6 +299,7 @@ class CrawlConfigOps:
             run_now=False,
             out_filename=self.default_filename_template,
             profile_filename=profile_filename or "",
+            warc_prefix=self.get_warc_prefix(org, crawlconfig),
         )
 
     async def update_crawl_config(
@@ -841,7 +843,10 @@ class CrawlConfigOps:
 
         try:
             crawl_id = await self.crawl_manager.create_crawl_job(
-                crawlconfig, org.storage, userid=str(user.id)
+                crawlconfig,
+                org.storage,
+                userid=str(user.id),
+                warc_prefix=self.get_warc_prefix(org, crawlconfig),
             )
             await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
             return crawl_id
@@ -897,6 +902,21 @@ class CrawlConfigOps:
         """Get crawler image name by id"""
         return self.crawler_images_map.get(crawler_channel or "")
 
+    def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
+        """Generate WARC prefix slug from org slug, name or url
+        if no name is provided, hostname is used from url, otherwise
+        url is ignored"""
+        name = crawlconfig.name
+        if not name:
+            if crawlconfig.config.seeds and len(crawlconfig.config.seeds):
+                url = crawlconfig.config.seeds[0].url
+                parts = urllib.parse.urlsplit(url)
+                name = parts.netloc
+
+        name = slug_from_name(name or "")
+        prefix = org.slug + "-" + name
+        return prefix[:80]
+
 
 # ============================================================================
 # pylint: disable=too-many-locals
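A walkthrough of `get_warc_prefix` for the examples in the commit message, assuming an org whose slug is `default-org` (illustrative values only):

# No crawl name; first seed is https://specs.webrecorder.net/ :
#   urllib.parse.urlsplit("https://specs.webrecorder.net/").netloc
#       -> "specs.webrecorder.net"
#   "default-org" + "-" + slug_from_name("specs.webrecorder.net")
#       -> "default-org-specs-webrecorder-net"
#
# Crawl name "SPECS" given; the seed URL is ignored:
#   "default-org" + "-" + slug_from_name("SPECS")
#       -> "default-org-specs"
#
# Either way, the prefix is truncated to at most 80 characters.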
@@ -119,6 +119,7 @@ class CrawlManager(K8sAPI):
         run_now: bool,
         out_filename: str,
         profile_filename: str,
+        warc_prefix: str,
     ) -> Optional[str]:
         """add new crawl, store crawl config in configmap"""
 
@@ -139,7 +140,10 @@ class CrawlManager(K8sAPI):
 
         if run_now:
             crawl_id = await self.create_crawl_job(
-                crawlconfig, storage, str(crawlconfig.modifiedBy)
+                crawlconfig,
+                storage,
+                str(crawlconfig.modifiedBy),
+                warc_prefix,
             )
 
         await self._update_scheduled_job(crawlconfig)
@@ -151,6 +155,7 @@ class CrawlManager(K8sAPI):
         crawlconfig: CrawlConfig,
         storage: StorageRef,
         userid: str,
+        warc_prefix: str,
     ) -> str:
         """create new crawl job from config"""
         cid = str(crawlconfig.id)
@@ -169,6 +174,7 @@ class CrawlManager(K8sAPI):
             crawlconfig.crawlTimeout,
             crawlconfig.maxCrawlSize,
             manual=True,
+            warc_prefix=warc_prefix,
         )
 
     async def update_crawl_config(
@@ -86,6 +86,7 @@ class K8sAPI:
         max_crawl_size=0,
         manual=True,
         crawl_id=None,
+        warc_prefix="",
     ):
         """load job template from yaml"""
         if not crawl_id:
@@ -104,6 +105,7 @@ class K8sAPI:
             "storage_name": str(storage),
             "manual": "1" if manual else "0",
             "crawler_channel": crawler_channel,
+            "warc_prefix": warc_prefix,
         }
 
         data = self.templates.env.get_template("crawl_job.yaml").render(params)
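The `warc_prefix` param above is rendered into the `crawl_job.yaml` template (see the chart change below). A self-contained sketch of that render step, using an inline stand-in template rather than the real chart file:

from jinja2 import Environment, DictLoader

# Stand-in for the chart's crawl_job.yaml; only the relevant line is shown.
env = Environment(loader=DictLoader(
    {"crawl_job.yaml": 'warcPrefix: "{{ warc_prefix }}"'}
))
params = {"warc_prefix": "default-org-specs-webrecorder-net"}
print(env.get_template("crawl_job.yaml").render(params))
# -> warcPrefix: "default-org-specs-webrecorder-net"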
@@ -538,6 +538,8 @@ class BtrixOperator(K8sAPI):
         params["storage_filename"] = configmap["STORE_FILENAME"]
         params["restart_time"] = spec.get("restartTime")
 
+        params["warc_prefix"] = spec.get("warcPrefix")
+
         params["redis_url"] = redis_url
 
         if spec.get("restartTime") != status.restartTime:
@@ -1651,26 +1653,10 @@ class BtrixOperator(K8sAPI):
         org = await self.org_ops.get_org_by_id(UUID(oid))
 
-        crawl_id, crawljob = self.new_crawl_job_yaml(
-            cid,
-            userid=userid,
-            oid=oid,
-            storage=org.storage,
-            crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
-            scale=int(configmap.get("INITIAL_SCALE", 1)),
-            crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
-            max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
-            manual=False,
-            crawl_id=crawl_id,
-        )
-
-        attachments = list(yaml.safe_load_all(crawljob))
-
-        if crawl_id in crawljobs:
-            attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]
+        warc_prefix = None
 
         if not actual_state:
             # pylint: disable=duplicate-code
             # cronjob doesn't exist yet
             crawlconfig = await self.crawl_config_ops.get_crawl_config(
                 UUID(cid), UUID(oid)
             )
@@ -1686,11 +1672,35 @@ class BtrixOperator(K8sAPI):
                 print(f"error: missing user for id {userid}")
                 return {"attachments": []}
 
+            warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)
+
             await self.crawl_config_ops.add_new_crawl(
-                crawl_id, crawlconfig, user, manual=False
+                crawl_id,
+                crawlconfig,
+                user,
+                manual=False,
             )
             print("Scheduled Crawl Created: " + crawl_id)
 
+        crawl_id, crawljob = self.new_crawl_job_yaml(
+            cid,
+            userid=userid,
+            oid=oid,
+            storage=org.storage,
+            crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
+            scale=int(configmap.get("INITIAL_SCALE", 1)),
+            crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
+            max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
+            manual=False,
+            crawl_id=crawl_id,
+            warc_prefix=warc_prefix,
+        )
+
+        attachments = list(yaml.safe_load_all(crawljob))
+
+        if crawl_id in crawljobs:
+            attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]
+
         return {
             "attachments": attachments,
         }
@@ -24,6 +24,7 @@ spec:
   manual: {{ manual }}
   crawlerChannel: "{{ crawler_channel }}"
   ttlSecondsAfterFinished: 30
+  warcPrefix: "{{ warc_prefix }}"
 
   storageName: "{{ storage_name }}"
 
@@ -149,6 +149,9 @@ spec:
         - name: STORE_USER
           value: "{{ userid }}"
 
+        - name: WARC_PREFIX
+          value: "{{ warc_prefix }}"
+
         {% if crawler_socks_proxy_host %}
         - name: SOCKS_HOST
           value: "{{ crawler_socks_proxy_host }}"
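The WARC_PREFIX variable set above is consumed by the crawler container (browsertrix-crawler 1.0.0-beta.4 or higher, per the commit message). The crawler itself is a separate JavaScript project; this Python sketch only illustrates the general env-var pattern, and the filename assembly is hypothetical, not the crawler's actual naming scheme:

import os

# Read the prefix injected by the pod template; fall back to no prefix.
prefix = os.environ.get("WARC_PREFIX", "")

# Hypothetical WARC filename assembly, for illustration only.
warc_name = (prefix + "-" if prefix else "") + "rec-20240101-example.warc.gz"
print(warc_name)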