Copy tags from crawlconfig to crawl (#467), fixes #466

- add tags to crawl object
- ensure tags are copied from the crawlconfig to the crawl when the crawl is created (both manual and scheduled crawls)
- tests: add test to ensure tags are added to the crawl; remove redundant wait loop, now handled by fixtures
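
A minimal sketch of the end-to-end behavior this adds, against the same endpoints the tests exercise; `API_PREFIX` is the test constant, while `aid`, `auth_headers`, and `crawl_id` stand in for values the test fixtures provide:

    import requests

    # create a crawlconfig with tags; runNow starts a crawl from it immediately
    crawl_data = {
        "runNow": True,
        "name": "Tagged Crawl",
        "tags": ["wr-test-1", "wr-test-2"],
        "config": {"seeds": ["https://webrecorder.net/"], "limit": 1},
    }
    requests.post(
        f"{API_PREFIX}/archives/{aid}/crawlconfigs/",
        headers=auth_headers,
        json=crawl_data,
    )

    # once the crawl exists, its tags mirror the crawlconfig's tags
    r = requests.get(
        f"{API_PREFIX}/archives/{aid}/crawls/{crawl_id}/replay.json",
        headers=auth_headers,
    )
    assert r.json()["tags"] == ["wr-test-1", "wr-test-2"]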
Ilya Kreymer 2023-01-12 17:46:19 -08:00 committed by GitHub
parent 49460bb070
commit 2daa742585
7 changed files with 24 additions and 20 deletions

View File

@@ -47,6 +47,7 @@ class CrawlJob(ABC):
         self.userid = uuid.UUID(os.environ["USER_ID"])
         self.is_manual = os.environ.get("RUN_MANUAL") == "1"
+        self.tags = os.environ.get("TAGS", "").split(",")
         self.scale = int(os.environ.get("INITIAL_SCALE") or 0)
@@ -360,6 +361,7 @@
             manual=self.is_manual,
             scale=scale,
             started=self.started,
+            tags=self.tags,
             # colls=json.loads(job.metadata.annotations.get("btrix.colls", [])),
         )

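The TAGS value is parsed back into a list with a plain `split(",")`. One edge worth noting, as standard Python behavior rather than anything specific to this code: splitting the empty string yields `[""]`, not `[]`.

    # round trip of the TAGS env var as parsed in CrawlJob.__init__ above
    import os

    os.environ["TAGS"] = "wr-test-1,wr-test-2"
    assert os.environ.get("TAGS", "").split(",") == ["wr-test-1", "wr-test-2"]

    # when TAGS is unset or empty, the result is [""] rather than []
    assert "".split(",") == [""]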
View File

@@ -183,6 +183,7 @@ class BaseCrawlManager(ABC):
             "schedule": schedule,
             "env": os.environ,
             "mongo_db_url": resolve_db_url(),
+            "tags": ",".join(crawlconfig.tags),
         }

         return self.templates.env.get_template("crawl_job.yaml").render(params)

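The manager flattens the tag list into one comma-joined string so it can travel through the pod template as a single env value; the crawl job splits it back apart (see the CrawlJob hunk above). A sketch of the round trip, including the one case it cannot represent:

    tags = ["wr-test-1", "wr-test-2"]
    serialized = ",".join(tags)  # "wr-test-1,wr-test-2"
    assert serialized.split(",") == tags

    # caveat: a tag containing a comma does not survive the round trip
    assert ",".join(["a,b"]).split(",") == ["a", "b"]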
View File

@@ -84,6 +84,7 @@ class Crawl(BaseMongoModel):
     files: Optional[List[CrawlFile]] = []
     colls: Optional[List[str]] = []
+    tags: Optional[List[str]] = []


 # ============================================================================
@@ -121,6 +122,7 @@ class ListCrawlOut(BaseMongoModel):
     fileCount: int = 0
     colls: Optional[List[str]] = []
+    tags: Optional[List[str]] = []


 # ============================================================================
@@ -358,6 +360,7 @@ class CrawlOps:
             scale=crawlconfig.scale,
             manual=True,
             started=ts_now(),
+            tags=crawlconfig.tags,
         )

         try:

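Both `Crawl` and `ListCrawlOut` default `tags` to an empty list. Because these are pydantic models, the mutable default is copied per instance, so the usual shared-list pitfall of plain Python class attributes does not apply. Illustrated with a simplified stand-in model, not the real class:

    from typing import List, Optional
    from pydantic import BaseModel

    class CrawlSketch(BaseModel):
        tags: Optional[List[str]] = []

    a = CrawlSketch()
    a.tags.append("x")
    assert CrawlSketch().tags == []  # default is copied per instance, not shared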
View File

@@ -70,6 +70,9 @@ spec:
           - name: CRAWL_CONFIG_ID
             value: "{{ cid }}"

+          - name: TAGS
+            value: "{{ tags }}"
+
           - name: STORE_PATH
             valueFrom:
               configMapKeyRef:

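The manager renders this template with Jinja2 (`get_template("crawl_job.yaml").render(params)` in the manager hunk above); quoting `"{{ tags }}"` keeps the comma-joined value a single YAML string. A standalone sketch of just that substitution, using plain jinja2 rather than the app's template loader:

    from jinja2 import Template

    snippet = Template('- name: TAGS\n  value: "{{ tags }}"')
    print(snippet.render(tags=",".join(["wr-test-1", "wr-test-2"])))
    # - name: TAGS
    #   value: "wr-test-1,wr-test-2"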
View File

@@ -58,7 +58,11 @@ def admin_crawl_id(admin_auth_headers, admin_aid):
     crawl_data = {
         "runNow": True,
         "name": "Admin Test Crawl",
-        "config": {"seeds": ["https://webrecorder.net/"], "limit": 1},
+        "tags": ["wr-test-1", "wr-test-2"],
+        "config": {
+            "seeds": ["https://webrecorder.net/"],
+            "limit": 1,
+        },
     }
     r = requests.post(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",

View File

@@ -14,7 +14,7 @@ def test_ensure_only_one_default_org(admin_auth_headers):
     default_org_name = default_orgs[0]["name"]
     orgs_with_same_name = [org for org in orgs if org["name"] == default_org_name]
     assert len(orgs_with_same_name) == 1


 def test_rename_org(admin_auth_headers, admin_aid):
     UPDATED_NAME = "updated org name"

@@ -35,6 +35,7 @@ def test_rename_org(admin_auth_headers, admin_aid):
     data = r.json()
     assert data["name"] == UPDATED_NAME


 def test_create_org(admin_auth_headers):
     NEW_ORG_NAME = "New Org"
     r = requests.post(

View File

@@ -46,28 +46,18 @@ def test_create_new_config(admin_auth_headers, admin_aid):
 def test_wait_for_complete(admin_auth_headers, admin_aid, admin_crawl_id):
     print("")
     print("---- Running Crawl ----")

-    while True:
-        r = requests.get(
-            f"{API_PREFIX}/archives/{admin_aid}/crawls/{admin_crawl_id}/replay.json",
-            headers=admin_auth_headers,
-        )
-        data = r.json()
-        assert (
-            data["state"] == "starting"
-            or data["state"] == "running"
-            or data["state"] == "complete"
-        ), data["state"]
-        if data["state"] == "complete":
-            break
-
-        time.sleep(5)
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawls/{admin_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    assert data["state"] == "complete"

     assert len(data["resources"]) == 1
     assert data["resources"][0]["path"]
+    assert data["tags"] == ["wr-test-1", "wr-test-2"]

     global wacz_path
     global wacz_size
     global wacz_hash
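
The polling the test used to do inline now lives behind the `admin_crawl_id` fixture, which returns only once the crawl is done. A hypothetical sketch of that fixture-side wait; the fixture body itself is not part of this diff beyond the `crawl_data` change:

    import time
    import requests

    def wait_for_complete(api_prefix, aid, crawl_id, headers, poll_secs=5):
        # poll replay.json until the crawl reaches the "complete" state
        while True:
            r = requests.get(
                f"{api_prefix}/archives/{aid}/crawls/{crawl_id}/replay.json",
                headers=headers,
            )
            state = r.json()["state"]
            assert state in ("starting", "running", "complete"), state
            if state == "complete":
                return
            time.sleep(poll_secs)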