Fix incorrect total page count when any of the seeds is a redirect (#1649)
Following the changes in webrecorder/browsertrix-crawler#475 and webrecorder/browsertrix-crawler#509, the crawler adds a redirected seed to the seen list as an extra seed. To account for this, the number of extra seeds needs to be subtracted from the size of the seen list to get the correct total page count.
This commit is contained in:
		
							parent
							
								
									83c9203a11
								
							
						
					
					
						commit
						5c08c9679c
					
				| @ -1178,6 +1178,11 @@ class CrawlOperator(BaseOperator): | ||||
|             pages_done = await redis.llen(f"{crawl_id}:d") | ||||
| 
 | ||||
|         pages_found = await redis.scard(f"{crawl_id}:s") | ||||
|         # account for extra seeds and subtract from seen list | ||||
|         extra_seeds = await redis.llen(f"{crawl_id}:extraSeeds") | ||||
|         if extra_seeds: | ||||
|             pages_found -= extra_seeds | ||||
| 
 | ||||
|         sizes = await redis.hgetall(f"{crawl_id}:size") | ||||
|         archive_size = sum(int(x) for x in sizes.values()) | ||||
| 
 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user