* backend: make crawlconfigs mutable! (#656)
- crawlconfig PATCH /{id} can now receive a new JSON config to replace the old one (in addition to scale, schedule, tags)
- exclusions: add / remove APIs mutate the current crawlconfig, do not result in a new crawlconfig being created
- exclusions: ensure crawl job 'config' is updated when exclusions are added/removed, unify add/remove exclusions on crawl
- k8s: crawlconfig json is updated along with scale
- k8s: stateful set is restarted by updating annotation, instead of changing template
- crawl object: now has 'config', as well as 'profileid', 'schedule', 'crawlTimeout', 'jobType' properties to ensure anything that is changeable is stored on the crawl
- crawlconfigcore: store shared properties between crawl and crawlconfig in new crawlconfigcore (includes 'schedule', 'jobType', 'config', 'profileid', 'crawlTimeout', 'tags', 'oid')
- crawlconfig object: remove 'oldId', 'newId', disallow deactivating/deleting while crawl is running
- rename 'userid' -> 'createdBy'
- remove unused 'completions' field
- add missing return to fix /run response
- crawlout: ensure 'profileName' is resolved on CrawlOut from profileid
- crawlout: return 'name' instead of 'configName' for consistent response
- update: 'modified', 'modifiedBy' fields to set modification date and user modifying config
- update: ensure PROFILE_FILENAME is updated in configmap if profileid is provided, clear if profileid==""
- update: return 'settings_changed' and 'metadata_changed' if either crawl settings or metadata changed
- tests: update tests to check settings_changed/metadata_changed return values

add revision tracking to crawlconfig:
- store each revision in a separate mongo db collection
- revisions accessible via /crawlconfigs/{cid}/revs
- store 'rev' int in crawlconfig and in crawljob
- only add revision history if crawl config changed

migration:
- update to db v3
- copy fields from crawlconfig -> crawl
- rename userid -> createdBy
- copy userid -> modifiedBy, created -> modified
- skip invalid crawls (missing config), make createdBy optional (just in case)

frontend: Update crawl config keys with new API (#681), update frontend to use new PATCH endpoint, load config from crawl object in details view

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Co-authored-by: sua yoo <sua@webrecorder.org>
Co-authored-by: sua yoo <sua@suayoo.com>
113 lines
2.7 KiB
YAML
113 lines
2.7 KiB
YAML
# Template for a Kubernetes batch/v1 Job named "job-{{ id }}".
# The Job runs a single "crawl-job" container that serves
# btrixcloud.k8s.crawl_job:app with uvicorn.
#
# Placeholders used below: id, manual, userid, oid, cid, rev, tags,
# crawler_node_type, job_image, job_pull_policy.
#
# Templated scalar values are quoted so that an empty or number/boolean-looking
# expansion still renders as a YAML string (label values and env values must be
# strings).
# NOTE(review): values are interpolated as-is inside double quotes; a value
# containing a double quote or backslash would break the rendered YAML --
# confirm inputs (notably "tags") are sanitized by the caller.
apiVersion: batch/v1
kind: Job
metadata:
  name: job-{{ id }}
  annotations:
    btrix.run.manual: "{{ manual }}"

  labels:
    btrix.user: "{{ userid }}"
    btrix.org: "{{ oid }}"
    btrix.crawlconfig: "{{ cid }}"

spec:
  # Number of retries allowed before the Job is marked as failed.
  backoffLimit: 1000
  # Finished Jobs become eligible for automatic deletion after 20 seconds.
  ttlSecondsAfterFinished: 20
  template:
    metadata:
      # Same labels as on the Job itself, applied to the pod.
      labels:
        btrix.user: "{{ userid }}"
        btrix.org: "{{ oid }}"
        btrix.crawlconfig: "{{ cid }}"
    spec:
      restartPolicy: OnFailure
      affinity:
        nodeAffinity:
          # Soft preference only: scheduling still succeeds when no node
          # carries a matching nodeType label.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              preference:
                matchExpressions:
                  - key: nodeType
                    operator: In
                    values:
                      - "{{ crawler_node_type }}"

      # NOTE(review): the tolerated taint value is the literal "crawling",
      # while the affinity above uses {{ crawler_node_type }} -- confirm the
      # two are meant to differ.
      tolerations:
        - key: "nodeType"
          operator: "Equal"
          value: "crawling"
          effect: "NoSchedule"

      containers:
        - name: crawl-job
          image: "{{ job_image }}"
          imagePullPolicy: "{{ job_pull_policy }}"
          command:
            - "uvicorn"
            - "btrixcloud.k8s.crawl_job:app"
            - "--host"
            - "0.0.0.0"
            - "--access-log"
            - "--log-level"
            - "info"

          volumeMounts:
            - name: config-volume
              mountPath: /config

          # Every key of the "mongo-auth" Secret is exposed as an env variable.
          envFrom:
            - secretRef:
                name: mongo-auth

          env:
            # Read from the pod's own "job-name" label via the downward API.
            - name: JOB_ID
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['job-name']

            - name: RUN_MANUAL
              value: "{{ manual }}"

            - name: USER_ID
              value: "{{ userid }}"

            - name: ORG_ID
              value: "{{ oid }}"

            - name: CRAWL_CONFIG_ID
              value: "{{ cid }}"

            - name: REV
              value: "{{ rev }}"

            - name: TAGS
              value: "{{ tags }}"

            # The remaining values are read from the per-crawlconfig ConfigMap
            # "crawl-config-{{ cid }}".
            - name: STORE_PATH
              valueFrom:
                configMapKeyRef:
                  name: crawl-config-{{ cid }}
                  key: STORE_PATH

            - name: STORE_FILENAME
              valueFrom:
                configMapKeyRef:
                  name: crawl-config-{{ cid }}
                  key: STORE_FILENAME

            - name: STORAGE_NAME
              valueFrom:
                configMapKeyRef:
                  name: crawl-config-{{ cid }}
                  key: STORAGE_NAME

            - name: PROFILE_FILENAME
              valueFrom:
                configMapKeyRef:
                  name: crawl-config-{{ cid }}
                  key: PROFILE_FILENAME

      volumes:
        # Projects only config.yaml from the "shared-job-config" ConfigMap;
        # mounted into the container at /config.
        - name: config-volume
          configMap:
            name: shared-job-config
            items:
              - key: config.yaml
                path: config.yaml