* Re-implement pagination and paginate crawlconfig revs First step toward simplifying pagination to set us up for sorting and filtering of list endpoints. This commit removes fastapi-pagination as a dependency. * Migrate all HttpUrl seeds to Seeds This commit also updates the frontend to always use Seeds and to fix display issues resulting from the change. * Filter and sort crawls and workflows Crawls: - Filter by createdBy (via userid param) - Filter by state (comma-separated string for multiple values) - Filter by first_seed, name, description - Sort by started, finished, fileSize, firstSeed - Sort descending by default to match frontend Workflows: - Filter by createdBy (formerly userid) and modifiedBy - Filter by first_seed, name, description - Sort by created, modified, firstSeed, lastCrawlTime * Add crawlconfigs search-values API endpoint and test
		
			
				
	
	
		
			119 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			119 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """
 | |
| Migration 0004 - Ensuring all config.seeds are Seeds not HttpUrls
 | |
| """
 | |
| from pydantic import HttpUrl
 | |
| 
 | |
| from btrixcloud.crawlconfigs import CrawlConfig, ScopeType, Seed
 | |
| from btrixcloud.crawls import Crawl
 | |
| from btrixcloud.migrations import BaseMigration
 | |
| 
 | |
| 
 | |
| MIGRATION_VERSION = "0004"
 | |
| 
 | |
| 
 | |
class Migration(BaseMigration):
    """Migration: convert legacy ``config.seeds`` entries (HttpUrl or plain
    string values) into full Seed dicts, for both workflows (``crawl_configs``)
    and the config copies embedded in ``crawls``.
    """

    def __init__(self, mdb, migration_version=MIGRATION_VERSION):
        super().__init__(mdb, migration_version)

    @staticmethod
    def _normalize_seeds(seed_list):
        """Convert a mixed list of HttpUrl/str/Seed entries into Seed dicts.

        Returns a list of dicts shaped for storage in ``config.seeds``.
        Entries of any other type are silently dropped (matches the original
        migration behavior).
        """
        seeds = []
        for seed in seed_list:
            if isinstance(seed, HttpUrl):
                # NOTE(review): assumes HttpUrl exposes a .url attribute here;
                # plain strings are handled by the branch below.
                seeds.append(Seed(url=str(seed.url), scopeType=ScopeType.PAGE))
            elif isinstance(seed, str):
                seeds.append(Seed(url=str(seed), scopeType=ScopeType.PAGE))
            elif isinstance(seed, Seed):
                seeds.append(seed)

        return [
            {
                "url": str(seed.url),
                "scopeType": seed.scopeType,
                "include": seed.include,
                "exclude": seed.exclude,
                "sitemap": seed.sitemap,
                "allowHash": seed.allowHash,
                "depth": seed.depth,
                "extraHops": seed.extraHops,
            }
            for seed in seeds
        ]

    async def _migrate_collection_seeds(self, collection, docs):
        """Rewrite ``config.seeds`` for each document in *docs*.

        Documents whose seeds normalize to an empty list are left untouched,
        as in the original per-collection loops.
        """
        for doc in docs:
            seed_dicts = self._normalize_seeds(doc["config"]["seeds"])
            if seed_dicts:
                await collection.find_one_and_update(
                    {"_id": doc["_id"]},
                    {"$set": {"config.seeds": seed_dicts}},
                )

    async def migrate_up(self):
        """Perform migration up.

        Convert any crawlconfig.config.seed HttpUrl values to Seeds with url
        value, in both the ``crawl_configs`` and ``crawls`` collections, then
        re-load every document through its model to sanity-check the result.
        """
        # Migrate workflows
        crawl_configs = self.mdb["crawl_configs"]
        crawl_config_results = [res async for res in crawl_configs.find({})]
        if not crawl_config_results:
            # No workflows at all: nothing to migrate or verify.
            return

        await self._migrate_collection_seeds(crawl_configs, crawl_config_results)

        # Migrate seeds copied into crawls
        crawls = self.mdb["crawls"]
        crawl_results = [res async for res in crawls.find({})]
        await self._migrate_collection_seeds(crawls, crawl_results)

        # Test migration: re-load through the models and verify every seed
        # is now a Seed carrying a url.
        crawl_config_results = [res async for res in crawl_configs.find({})]
        for config_dict in crawl_config_results:
            config = CrawlConfig.from_dict(config_dict)
            for seed in config.config.seeds:
                assert isinstance(seed, Seed)
                assert seed.url

        crawl_results = [res async for res in crawls.find({})]
        for crawl_dict in crawl_results:
            crawl = Crawl.from_dict(crawl_dict)
            for seed in crawl.config.seeds:
                assert isinstance(seed, Seed)
                assert seed.url
 |