For single-page crawls: - Always force 1 browser to be used, ignoring the browser windows/scale setting. - Don't use custom PVC volumes in the crawler / redis pods; just use emptyDir — there is no chance of the crawler being interrupted and restarted on a different machine for a single page. Adds an 'is_single_page' check to CrawlConfig, checking for either the page limit or the scopeType / no extra hops. Fixes #2655
		
			
				
	
	
		
			44 lines
		
	
	
		
			995 B
		
	
	
	
		
			YAML
		
	
	
	
	
	
			
		
		
	
	
			44 lines
		
	
	
		
			995 B
		
	
	
	
		
			YAML
		
	
	
	
	
	
# Jinja2 template for a btrix.cloud CrawlJob custom resource.
# Rendered server-side before being applied to the cluster; all `{{ ... }}`
# expressions are substituted by the template engine, not by Kubernetes.
# String-valued fields are quoted so empty/boolean-looking expansions stay
# strings; numeric/boolean fields (scale, timeout, manual, ...) are left
# unquoted so the CRD receives proper types.
apiVersion: btrix.cloud/v1
kind: CrawlJob
metadata:
  name: crawljob-{{ id }}
  labels:
    crawl: "{{ id }}"
    # QA runs get a distinct role label so they can be selected separately
    role: {{ "qa-job" if qa_source else "job" }}
    btrix.org: "{{ oid }}"
    btrix.user: "{{ userid }}"
    btrix.storage: "{{ storage_name }}"

spec:
  selector:
    matchLabels:
      crawl: "{{ id }}"

  id: "{{ id }}"
  userid: "{{ userid }}"
  cid: "{{ cid }}"
  oid: "{{ oid }}"
  scale: {{ scale }}
  browserWindows: {{ browser_windows }}

  profile_filename: "{{ profile_filename }}"
  storage_filename: "{{ storage_filename }}"

  # Size/time limits are disabled (0) for QA runs, which replay an
  # existing crawl rather than fetching new content
  maxCrawlSize: {{ max_crawl_size if not qa_source else 0 }}
  timeout: {{ timeout if not qa_source else 0 }}
  qaSourceCrawlId: "{{ qa_source }}"

  manual: {{ manual }}
  crawlerChannel: "{{ crawler_channel }}"
  # QA jobs are cleaned up immediately after finishing
  ttlSecondsAfterFinished: {{ 30 if not qa_source else 0 }}
  warcPrefix: "{{ warc_prefix }}"

  storageName: "{{ storage_name }}"

  proxyId: "{{ proxy_id }}"

  pausedAt: "{{ pausedAt }}"

  # NOTE(review): quoted string, not a bool — presumably parsed by the
  # operator; confirm consumer expects "True"/"False" text
  isSinglePage: "{{ is_single_page }}"