Temporarily remove pages migration (#1572)

Removing this until we have a better-tested solution, and to avoid triggering QA runs for new crawls in beta.
This commit is contained in:
Tessa Walsh 2024-03-04 13:30:04 -05:00 committed by GitHub
parent 144000c7a3
commit ec0db1c323
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 1 additions and 48 deletions

View File

@ -17,7 +17,7 @@ from pymongo.errors import InvalidName
from .migrations import BaseMigration
CURR_DB_VERSION = "0026"
CURR_DB_VERSION = "0025"
# ============================================================================

View File

@ -1,47 +0,0 @@
"""
Migration 0026 -- Crawl Pages
"""
from btrixcloud.migrations import BaseMigration
from btrixcloud.utils import gather_tasks_with_concurrency
MIGRATION_VERSION = "0026"
class Migration(BaseMigration):
    """Migration 0026 -- backfill crawl pages from WACZ files.

    For every finished crawl that has no entries in the pages collection,
    extract its pages from the stored WACZ files and add them to the database.
    """

    def __init__(self, mdb, **kwargs):
        """Initialize migration.

        :param mdb: Mongo database handle; collections accessed by name.
        :param kwargs: must contain "page_ops", an object providing
            ``add_crawl_pages_to_db_from_wacz(crawl_id)`` — TODO confirm
            exact contract against caller that registers migrations.
        """
        super().__init__(mdb, migration_version=MIGRATION_VERSION)
        self.page_ops = kwargs["page_ops"]

    async def migrate_up(self):
        """Perform migration up.

        Add pages to database for each crawl without them, pulling from WACZ files.
        """
        # pylint: disable=duplicate-code
        crawls_mdb = self.mdb["crawls"]
        pages_mdb = self.mdb["pages"]

        # Only finished crawls have complete WACZ files to pull pages from
        crawl_ids = await crawls_mdb.distinct(
            "_id", {"type": "crawl", "finished": {"$ne": None}}
        )
        crawl_ids_with_pages = await pages_mdb.distinct("crawl_id")

        # Crawls that don't yet have any page documents
        crawl_ids_no_pages = list(set(crawl_ids) - set(crawl_ids_with_pages))
        if not crawl_ids_no_pages:
            return

        # Build the coroutine list with a comprehension (PERF401) instead of
        # a manual append loop; coroutines are not awaited until gathered below
        all_coroutines = [
            self.page_ops.add_crawl_pages_to_db_from_wacz(crawl_id)
            for crawl_id in crawl_ids_no_pages
        ]

        try:
            await gather_tasks_with_concurrency(*all_coroutines)
        # Best-effort: a page-backfill failure should not abort the migration
        # pylint: disable=broad-exception-caught, raise-missing-from
        except Exception as err:
            print(f"Error adding pages to db: {err}", flush=True)