Supports setting the WARC prefix for WARCs inside a WACZ to `<org slug>-<slug [crawl name | first seed host]>`. - The prefix is set via the WARC_PREFIX env var, supported in browsertrix-crawler 1.0.0-beta.4 or higher. If a crawl name is provided, the crawl name is used; otherwise the hostname of the first seed is used. The name is 'sluggified', using lowercase alphanumeric characters separated by dashes. Ex: in an organization called `Default Org`, a crawl of `https://specs.webrecorder.net/` with no name will have WARCs named: `default-org-specs-webrecorder-net-....warc.gz` If the crawl is given the name `SPECS`, the WARCs will be named `default-org-specs-manual-....warc.gz` Fixes #412 by providing a default prefix.
		
			
				
	
	
		
			31 lines
		
	
	
		
			615 B
		
	
	
	
		
			YAML
		
	
	
	
	
	
			
		
		
	
	
			31 lines
		
	
	
		
			615 B
		
	
	
	
		
			YAML
		
	
	
	
	
	
# CrawlJob custom resource (btrix.cloud/v1) manifest template.
# Double-curly-brace placeholders are substituted by the template engine
# before the rendered text is parsed as YAML.
apiVersion: btrix.cloud/v1
kind: CrawlJob
metadata:
  name: crawljob-{{ id }}
  labels:
    # Label values are quoted so they always render as strings.
    crawl: "{{ id }}"
    role: "job"
    btrix.org: "{{ oid }}"
    btrix.user: "{{ userid }}"
    btrix.storage: "{{ storage_name }}"

spec:
  # Selects resources carrying the same crawl label that is set above.
  selector:
    matchLabels:
      crawl: "{{ id }}"

  id: "{{ id }}"
  userid: "{{ userid }}"
  cid: "{{ cid }}"
  oid: "{{ oid }}"
  # Left unquoted so the rendered values keep their native YAML types
  # (presumably integers and a boolean -- confirm against the CrawlJob schema).
  scale: {{ scale }}
  maxCrawlSize: {{ max_crawl_size }}
  timeout: {{ timeout }}
  manual: {{ manual }}
  crawlerChannel: "{{ crawler_channel }}"
  ttlSecondsAfterFinished: 30
  # Filename prefix for the WARCs written inside the WACZ; quoted so an
  # empty or number-like value still renders as a string.
  warcPrefix: "{{ warc_prefix }}"

  storageName: "{{ storage_name }}"