fix issue with incorrect number of total pages if any of the seeds is a redirect (#1649)
Following the changes in webrecorder/browsertrix-crawler#475 and webrecorder/browsertrix-crawler#509, the crawler adds a redirected seed to the seen list. To account for this, the number of extra seeds needs to be subtracted from the size of the seen list to get the total page count.
This commit is contained in:
		
							parent
							
								
									83c9203a11
								
							
						
					
					
						commit
						5c08c9679c
					
				| @ -1178,6 +1178,11 @@ class CrawlOperator(BaseOperator): | |||||||
|             pages_done = await redis.llen(f"{crawl_id}:d") |             pages_done = await redis.llen(f"{crawl_id}:d") | ||||||
| 
 | 
 | ||||||
|         pages_found = await redis.scard(f"{crawl_id}:s") |         pages_found = await redis.scard(f"{crawl_id}:s") | ||||||
|  |         # account for extra seeds and subtract from seen list | ||||||
|  |         extra_seeds = await redis.llen(f"{crawl_id}:extraSeeds") | ||||||
|  |         if extra_seeds: | ||||||
|  |             pages_found -= extra_seeds | ||||||
|  | 
 | ||||||
|         sizes = await redis.hgetall(f"{crawl_id}:size") |         sizes = await redis.hgetall(f"{crawl_id}:size") | ||||||
|         archive_size = sum(int(x) for x in sizes.values()) |         archive_size = sum(int(x) for x in sizes.values()) | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user