Copy tags from crawlconfig to crawl (#467), fixes #466

- add tags to crawl object
- ensure tags are copied from crawlconfig to crawl when the crawl is created (both manually and scheduled)
- tests: add test to ensure tags are added to the crawl; remove the redundant wait loop, now replaced with fixtures
Ilya Kreymer 2023-01-12 17:46:19 -08:00, committed by GitHub
parent 49460bb070
commit 2daa742585
7 changed files with 24 additions and 20 deletions

@@ -47,6 +47,7 @@ class CrawlJob(ABC):
         self.userid = uuid.UUID(os.environ["USER_ID"])
         self.is_manual = os.environ.get("RUN_MANUAL") == "1"
+        self.tags = os.environ.get("TAGS", "").split(",")

         self.scale = int(os.environ.get("INITIAL_SCALE") or 0)
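
A quirk of str.split worth noting in the parsing above (plain Python behavior, not something this commit changes): when TAGS is unset or empty, "".split(",") returns [""] rather than [], so the job would carry one empty-string tag unless the value is filtered. A minimal sketch:

    import os

    # "".split(",") yields [""], not []:
    assert "".split(",") == [""]
    assert "wr-test-1,wr-test-2".split(",") == ["wr-test-1", "wr-test-2"]

    # A defensive variant (illustration only, not what the diff does):
    tags = [t for t in os.environ.get("TAGS", "").split(",") if t]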
@@ -360,6 +361,7 @@ class CrawlJob(ABC):
             manual=self.is_manual,
             scale=scale,
             started=self.started,
+            tags=self.tags,
             # colls=json.loads(job.metadata.annotations.get("btrix.colls", [])),
         )

@@ -183,6 +183,7 @@ class BaseCrawlManager(ABC):
             "schedule": schedule,
             "env": os.environ,
             "mongo_db_url": resolve_db_url(),
+            "tags": ",".join(crawlconfig.tags),
         }

         return self.templates.env.get_template("crawl_job.yaml").render(params)
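
The manager side serializes the list into a single comma-separated string for the job environment, which the CrawlJob parsing shown earlier splits back apart. A minimal standalone sketch of the round-trip, with a hypothetical tags list:

    tags = ["wr-test-1", "wr-test-2"]
    env_value = ",".join(tags)             # "wr-test-1,wr-test-2"
    assert env_value.split(",") == tags    # round-trips while no tag contains ","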

@@ -84,6 +84,7 @@ class Crawl(BaseMongoModel):
     files: Optional[List[CrawlFile]] = []
     colls: Optional[List[str]] = []
+    tags: Optional[List[str]] = []

 # ============================================================================
@@ -121,6 +122,7 @@ class ListCrawlOut(BaseMongoModel):
     fileCount: int = 0
     colls: Optional[List[str]] = []
+    tags: Optional[List[str]] = []

 # ============================================================================
@@ -358,6 +360,7 @@ class CrawlOps:
             scale=crawlconfig.scale,
             manual=True,
             started=ts_now(),
+            tags=crawlconfig.tags,
         )

         try:
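
Since tags defaults to a mutable [], it may be worth noting that pydantic (unlike a plain Python default argument) copies field defaults for each instance, so two crawls never share the same list. A minimal sketch with a hypothetical stand-in model, assuming pydantic v1 semantics as used at the time:

    from typing import List, Optional
    from pydantic import BaseModel

    class CrawlSketch(BaseModel):   # hypothetical stand-in for Crawl
        tags: Optional[List[str]] = []

    a = CrawlSketch()
    b = CrawlSketch()
    a.tags.append("wr-test-1")
    assert b.tags == []             # defaults are copied, not shared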

@@ -70,6 +70,9 @@ spec:
           - name: CRAWL_CONFIG_ID
             value: "{{ cid }}"

+          - name: TAGS
+            value: "{{ tags }}"
+
           - name: STORE_PATH
             valueFrom:
               configMapKeyRef:
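
For reference, a minimal sketch of how the tags param lands in the rendered job YAML; the template string here is a hypothetical excerpt of crawl_job.yaml, rendered with jinja2 the same way the manager renders the full template:

    from jinja2 import Template

    tmpl = Template('- name: TAGS\n  value: "{{ tags }}"')
    print(tmpl.render(tags="wr-test-1,wr-test-2"))
    # - name: TAGS
    #   value: "wr-test-1,wr-test-2"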

@@ -58,7 +58,11 @@ def admin_crawl_id(admin_auth_headers, admin_aid):
     crawl_data = {
         "runNow": True,
         "name": "Admin Test Crawl",
-        "config": {"seeds": ["https://webrecorder.net/"], "limit": 1},
+        "tags": ["wr-test-1", "wr-test-2"],
+        "config": {
+            "seeds": ["https://webrecorder.net/"],
+            "limit": 1,
+        },
     }
     r = requests.post(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",

@@ -35,6 +35,7 @@ def test_rename_org(admin_auth_headers, admin_aid):
     data = r.json()
     assert data["name"] == UPDATED_NAME

+
 def test_create_org(admin_auth_headers):
     NEW_ORG_NAME = "New Org"
     r = requests.post(

@@ -46,28 +46,18 @@ def test_create_new_config(admin_auth_headers, admin_aid):
 def test_wait_for_complete(admin_auth_headers, admin_aid, admin_crawl_id):
-    print("")
-    print("---- Running Crawl ----")
-
-    while True:
-        r = requests.get(
-            f"{API_PREFIX}/archives/{admin_aid}/crawls/{admin_crawl_id}/replay.json",
-            headers=admin_auth_headers,
-        )
-        data = r.json()
-        assert (
-            data["state"] == "starting"
-            or data["state"] == "running"
-            or data["state"] == "complete"
-        ), data["state"]
-
-        if data["state"] == "complete":
-            break
-
-        time.sleep(5)
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawls/{admin_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    assert data["state"] == "complete"

     assert len(data["resources"]) == 1
     assert data["resources"][0]["path"]

+    assert data["tags"] == ["wr-test-1", "wr-test-2"]
+
     global wacz_path
     global wacz_size
     global wacz_hash
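
The in-test polling loop becomes redundant because the admin_crawl_id fixture can do the waiting once, before any test runs. A hypothetical sketch of that kind of fixture (names, endpoint, and API_PREFIX mirror the diff; the actual conftest may differ):

    import time
    import pytest
    import requests

    @pytest.fixture(scope="session")
    def admin_crawl_id_sketch(admin_auth_headers, admin_aid):
        # create the crawlconfig via the POST shown in the conftest hunk,
        # then poll until the resulting crawl completes before returning
        crawl_id = "..."  # placeholder for the id returned by that POST
        while True:
            r = requests.get(
                f"{API_PREFIX}/archives/{admin_aid}/crawls/{crawl_id}/replay.json",
                headers=admin_auth_headers,
            )
            if r.json()["state"] == "complete":
                return crawl_id
            time.sleep(5)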