More friendly WARC prefix inside WACZ based on Org slug + Crawl Name / First Seed URL. (#1537)
Supports setting the WARC prefix for WARCs inside a WACZ to `<org slug>-<slug [crawl name | first seed host]>`. The prefix is set via the WARC_PREFIX env var, supported in browsertrix-crawler 1.0.0-beta.4 or higher. If a crawl name is provided, the crawl name is used; otherwise the hostname of the first seed is used. The name is 'sluggified', using lowercase alphanumeric characters separated by dashes. Ex: in an organization called `Default Org`, a crawl of `https://specs.webrecorder.net/` with no name will have WARCs named: `default-org-specs-webrecorder-net-....warc.gz`. If the crawl is given the name `SPECS`, the WARCs will be named `default-org-specs-manual-....warc.gz`. Fixes #412 in a default way.
This commit is contained in:
parent
ba18abc063
commit
8ae032ff88
@ -33,7 +33,7 @@ from .models import (
|
|||||||
CrawlerChannel,
|
CrawlerChannel,
|
||||||
CrawlerChannels,
|
CrawlerChannels,
|
||||||
)
|
)
|
||||||
from .utils import dt_now
|
from .utils import dt_now, slug_from_name
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from .orgs import OrgOps
|
from .orgs import OrgOps
|
||||||
@ -232,6 +232,7 @@ class CrawlConfigOps:
|
|||||||
run_now=run_now,
|
run_now=run_now,
|
||||||
out_filename=out_filename,
|
out_filename=out_filename,
|
||||||
profile_filename=profile_filename or "",
|
profile_filename=profile_filename or "",
|
||||||
|
warc_prefix=self.get_warc_prefix(org, crawlconfig),
|
||||||
)
|
)
|
||||||
|
|
||||||
if crawl_id and run_now:
|
if crawl_id and run_now:
|
||||||
@ -298,6 +299,7 @@ class CrawlConfigOps:
|
|||||||
run_now=False,
|
run_now=False,
|
||||||
out_filename=self.default_filename_template,
|
out_filename=self.default_filename_template,
|
||||||
profile_filename=profile_filename or "",
|
profile_filename=profile_filename or "",
|
||||||
|
warc_prefix=self.get_warc_prefix(org, crawlconfig),
|
||||||
)
|
)
|
||||||
|
|
||||||
async def update_crawl_config(
|
async def update_crawl_config(
|
||||||
@ -841,7 +843,10 @@ class CrawlConfigOps:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
crawl_id = await self.crawl_manager.create_crawl_job(
|
crawl_id = await self.crawl_manager.create_crawl_job(
|
||||||
crawlconfig, org.storage, userid=str(user.id)
|
crawlconfig,
|
||||||
|
org.storage,
|
||||||
|
userid=str(user.id),
|
||||||
|
warc_prefix=self.get_warc_prefix(org, crawlconfig),
|
||||||
)
|
)
|
||||||
await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
|
await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
|
||||||
return crawl_id
|
return crawl_id
|
||||||
@ -897,6 +902,21 @@ class CrawlConfigOps:
|
|||||||
"""Get crawler image name by id"""
|
"""Get crawler image name by id"""
|
||||||
return self.crawler_images_map.get(crawler_channel or "")
|
return self.crawler_images_map.get(crawler_channel or "")
|
||||||
|
|
||||||
|
def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
    """Generate the WARC filename prefix slug for a crawl.

    The prefix is ``<org slug>-<name slug>``, where the name slug comes
    from the crawl config's name if set; otherwise it falls back to the
    hostname (netloc) of the first seed URL. If neither is available the
    name portion is the empty string.

    :param org: organization the crawl belongs to (provides ``org.slug``)
    :param crawlconfig: crawl config providing ``name`` and ``config.seeds``
    :return: slugified prefix, truncated to at most 80 characters
    """
    name = crawlconfig.name
    if not name:
        # No explicit crawl name: derive one from the first seed's host.
        if crawlconfig.config.seeds:
            url = crawlconfig.config.seeds[0].url
            parts = urllib.parse.urlsplit(url)
            name = parts.netloc

    # slug_from_name lowercases and dash-separates alphanumeric runs;
    # guard with "" in case name is still unset (no name, no seeds).
    name = slug_from_name(name or "")
    prefix = org.slug + "-" + name
    # Cap length so generated WARC filenames stay manageable.
    return prefix[:80]
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# pylint: disable=too-many-locals
|
# pylint: disable=too-many-locals
|
||||||
|
|||||||
@ -119,6 +119,7 @@ class CrawlManager(K8sAPI):
|
|||||||
run_now: bool,
|
run_now: bool,
|
||||||
out_filename: str,
|
out_filename: str,
|
||||||
profile_filename: str,
|
profile_filename: str,
|
||||||
|
warc_prefix: str,
|
||||||
) -> Optional[str]:
|
) -> Optional[str]:
|
||||||
"""add new crawl, store crawl config in configmap"""
|
"""add new crawl, store crawl config in configmap"""
|
||||||
|
|
||||||
@ -139,7 +140,10 @@ class CrawlManager(K8sAPI):
|
|||||||
|
|
||||||
if run_now:
|
if run_now:
|
||||||
crawl_id = await self.create_crawl_job(
|
crawl_id = await self.create_crawl_job(
|
||||||
crawlconfig, storage, str(crawlconfig.modifiedBy)
|
crawlconfig,
|
||||||
|
storage,
|
||||||
|
str(crawlconfig.modifiedBy),
|
||||||
|
warc_prefix,
|
||||||
)
|
)
|
||||||
|
|
||||||
await self._update_scheduled_job(crawlconfig)
|
await self._update_scheduled_job(crawlconfig)
|
||||||
@ -151,6 +155,7 @@ class CrawlManager(K8sAPI):
|
|||||||
crawlconfig: CrawlConfig,
|
crawlconfig: CrawlConfig,
|
||||||
storage: StorageRef,
|
storage: StorageRef,
|
||||||
userid: str,
|
userid: str,
|
||||||
|
warc_prefix: str,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""create new crawl job from config"""
|
"""create new crawl job from config"""
|
||||||
cid = str(crawlconfig.id)
|
cid = str(crawlconfig.id)
|
||||||
@ -169,6 +174,7 @@ class CrawlManager(K8sAPI):
|
|||||||
crawlconfig.crawlTimeout,
|
crawlconfig.crawlTimeout,
|
||||||
crawlconfig.maxCrawlSize,
|
crawlconfig.maxCrawlSize,
|
||||||
manual=True,
|
manual=True,
|
||||||
|
warc_prefix=warc_prefix,
|
||||||
)
|
)
|
||||||
|
|
||||||
async def update_crawl_config(
|
async def update_crawl_config(
|
||||||
|
|||||||
@ -86,6 +86,7 @@ class K8sAPI:
|
|||||||
max_crawl_size=0,
|
max_crawl_size=0,
|
||||||
manual=True,
|
manual=True,
|
||||||
crawl_id=None,
|
crawl_id=None,
|
||||||
|
warc_prefix="",
|
||||||
):
|
):
|
||||||
"""load job template from yaml"""
|
"""load job template from yaml"""
|
||||||
if not crawl_id:
|
if not crawl_id:
|
||||||
@ -104,6 +105,7 @@ class K8sAPI:
|
|||||||
"storage_name": str(storage),
|
"storage_name": str(storage),
|
||||||
"manual": "1" if manual else "0",
|
"manual": "1" if manual else "0",
|
||||||
"crawler_channel": crawler_channel,
|
"crawler_channel": crawler_channel,
|
||||||
|
"warc_prefix": warc_prefix,
|
||||||
}
|
}
|
||||||
|
|
||||||
data = self.templates.env.get_template("crawl_job.yaml").render(params)
|
data = self.templates.env.get_template("crawl_job.yaml").render(params)
|
||||||
|
|||||||
@ -538,6 +538,8 @@ class BtrixOperator(K8sAPI):
|
|||||||
params["storage_filename"] = configmap["STORE_FILENAME"]
|
params["storage_filename"] = configmap["STORE_FILENAME"]
|
||||||
params["restart_time"] = spec.get("restartTime")
|
params["restart_time"] = spec.get("restartTime")
|
||||||
|
|
||||||
|
params["warc_prefix"] = spec.get("warcPrefix")
|
||||||
|
|
||||||
params["redis_url"] = redis_url
|
params["redis_url"] = redis_url
|
||||||
|
|
||||||
if spec.get("restartTime") != status.restartTime:
|
if spec.get("restartTime") != status.restartTime:
|
||||||
@ -1651,26 +1653,10 @@ class BtrixOperator(K8sAPI):
|
|||||||
|
|
||||||
org = await self.org_ops.get_org_by_id(UUID(oid))
|
org = await self.org_ops.get_org_by_id(UUID(oid))
|
||||||
|
|
||||||
crawl_id, crawljob = self.new_crawl_job_yaml(
|
warc_prefix = None
|
||||||
cid,
|
|
||||||
userid=userid,
|
|
||||||
oid=oid,
|
|
||||||
storage=org.storage,
|
|
||||||
crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
|
|
||||||
scale=int(configmap.get("INITIAL_SCALE", 1)),
|
|
||||||
crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
|
|
||||||
max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
|
|
||||||
manual=False,
|
|
||||||
crawl_id=crawl_id,
|
|
||||||
)
|
|
||||||
|
|
||||||
attachments = list(yaml.safe_load_all(crawljob))
|
|
||||||
|
|
||||||
if crawl_id in crawljobs:
|
|
||||||
attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]
|
|
||||||
|
|
||||||
if not actual_state:
|
if not actual_state:
|
||||||
# pylint: disable=duplicate-code
|
# cronjob doesn't exist yet
|
||||||
crawlconfig = await self.crawl_config_ops.get_crawl_config(
|
crawlconfig = await self.crawl_config_ops.get_crawl_config(
|
||||||
UUID(cid), UUID(oid)
|
UUID(cid), UUID(oid)
|
||||||
)
|
)
|
||||||
@ -1686,11 +1672,35 @@ class BtrixOperator(K8sAPI):
|
|||||||
print(f"error: missing user for id {userid}")
|
print(f"error: missing user for id {userid}")
|
||||||
return {"attachments": []}
|
return {"attachments": []}
|
||||||
|
|
||||||
|
warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)
|
||||||
|
|
||||||
await self.crawl_config_ops.add_new_crawl(
|
await self.crawl_config_ops.add_new_crawl(
|
||||||
crawl_id, crawlconfig, user, manual=False
|
crawl_id,
|
||||||
|
crawlconfig,
|
||||||
|
user,
|
||||||
|
manual=False,
|
||||||
)
|
)
|
||||||
print("Scheduled Crawl Created: " + crawl_id)
|
print("Scheduled Crawl Created: " + crawl_id)
|
||||||
|
|
||||||
|
crawl_id, crawljob = self.new_crawl_job_yaml(
|
||||||
|
cid,
|
||||||
|
userid=userid,
|
||||||
|
oid=oid,
|
||||||
|
storage=org.storage,
|
||||||
|
crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
|
||||||
|
scale=int(configmap.get("INITIAL_SCALE", 1)),
|
||||||
|
crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
|
||||||
|
max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
|
||||||
|
manual=False,
|
||||||
|
crawl_id=crawl_id,
|
||||||
|
warc_prefix=warc_prefix,
|
||||||
|
)
|
||||||
|
|
||||||
|
attachments = list(yaml.safe_load_all(crawljob))
|
||||||
|
|
||||||
|
if crawl_id in crawljobs:
|
||||||
|
attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"attachments": attachments,
|
"attachments": attachments,
|
||||||
}
|
}
|
||||||
|
|||||||
@ -24,6 +24,7 @@ spec:
|
|||||||
manual: {{ manual }}
|
manual: {{ manual }}
|
||||||
crawlerChannel: "{{ crawler_channel }}"
|
crawlerChannel: "{{ crawler_channel }}"
|
||||||
ttlSecondsAfterFinished: 30
|
ttlSecondsAfterFinished: 30
|
||||||
|
warcPrefix: "{{ warc_prefix }}"
|
||||||
|
|
||||||
storageName: "{{ storage_name }}"
|
storageName: "{{ storage_name }}"
|
||||||
|
|
||||||
|
|||||||
@ -149,6 +149,9 @@ spec:
|
|||||||
- name: STORE_USER
|
- name: STORE_USER
|
||||||
value: "{{ userid }}"
|
value: "{{ userid }}"
|
||||||
|
|
||||||
|
- name: WARC_PREFIX
|
||||||
|
value: "{{ warc_prefix }}"
|
||||||
|
|
||||||
{% if crawler_socks_proxy_host %}
|
{% if crawler_socks_proxy_host %}
|
||||||
- name: SOCKS_HOST
|
- name: SOCKS_HOST
|
||||||
value: "{{ crawler_socks_proxy_host }}"
|
value: "{{ crawler_socks_proxy_host }}"
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user