Fixes #2406

Converts migration 0042 to launch a background job (parallelized across several pods) to migrate all crawls by optimizing their pages and setting `version: 2` on the crawl when complete. Also optimizes MongoDB queries for better performance.

Migration Improvements:
- Add `isMigrating` and `version` fields to `BaseCrawl`
- Add new background job type to use in migration with accompanying `migration_job.yaml` template that allows for parallelization
- Add new API endpoint to launch this crawl migration job, and ensure that we have list and retry endpoints for superusers that work with background jobs that aren't tied to a specific org
- Rework background job models and methods now that not all background jobs are tied to a single org
- Ensure new crawls and uploads have `version` set to `2`
- Modify crawl and collection replay.json endpoints to only include fields for replay optimization (`initialPages`, `pageQueryUrl`, `preloadResources`) if all relevant crawls/uploads have `version` set to `2`
- Remove `distinct` calls from migration pathways
- Consolidate collection recompute stats

Query Optimizations:
- Remove all uses of $group and $facet
- Optimize /replay.json endpoints to precompute preload_resources, avoid fetching crawl list twice
- Optimize /collections endpoint by not fetching resources
- Rename /urls -> /pageUrlCounts and avoid $group, instead sort with index, either by seed + ts or by url to get top matches.
- Use $gte instead of $regex to get prefix matches on URL
- Use $text instead of $regex to get text search on title
- Remove total from /pages and /pageUrlCounts queries by not using $facet
- frontend: only call /pageUrlCounts when dialog is opened.

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: Emma Segal-Grossman <hi@emma.cafe>
Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
		
			
				
	
	
		
			189 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			189 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """Basic Email Sending Support"""
 | |
| 
 | |
from datetime import datetime
import os
import smtplib
import ssl
from uuid import UUID
from typing import Optional, Union
from urllib.parse import quote

from email.message import EmailMessage
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

from fastapi import HTTPException
from fastapi.templating import Jinja2Templates

from .models import CreateReplicaJob, DeleteReplicaJob, Organization, InvitePending
from .utils import is_bool, get_origin
 | |
| 
 | |
| 
 | |
# pylint: disable=too-few-public-methods, too-many-instance-attributes
class EmailSender:
    """SMTP Email Sender

    Renders Jinja2 templates from the "email-templates" directory next to
    this module and delivers them over SMTP. All settings are read from
    environment variables when the sender is constructed. If no SMTP host
    is configured, messages are rendered (and optionally logged) but not sent.
    """

    # SMTP login username (EMAIL_SENDER); also the fallback for reply_to
    sender: str
    # SMTP login password (EMAIL_PASSWORD); login is skipped when empty
    password: str
    # address used for both the From and Reply-To headers (EMAIL_REPLY_TO)
    reply_to: str
    # SMTP host (EMAIL_SMTP_HOST); when unset nothing is actually sent
    smtp_server: Optional[str]
    # SMTP port (EMAIL_SMTP_PORT), defaults to 587
    smtp_port: int
    # whether to upgrade the connection with STARTTLS (EMAIL_SMTP_USE_TLS)
    smtp_use_tls: bool
    # support contact passed to templates (EMAIL_SUPPORT), defaults to reply_to
    support_email: str
    # survey link passed to the cancellation template (USER_SURVEY_URL)
    survey_url: str

    templates: Jinja2Templates

    # when set (LOG_SENT_EMAILS), every rendered email is printed to stdout
    log_sent_emails: bool

    def __init__(self):
        """Load email settings from the environment and set up templates"""
        self.sender = os.environ.get("EMAIL_SENDER") or "Browsertrix admin"
        self.password = os.environ.get("EMAIL_PASSWORD") or ""
        self.reply_to = os.environ.get("EMAIL_REPLY_TO") or self.sender
        self.support_email = os.environ.get("EMAIL_SUPPORT") or self.reply_to
        self.survey_url = os.environ.get("USER_SURVEY_URL") or ""
        self.smtp_server = os.environ.get("EMAIL_SMTP_HOST")
        # use `or` rather than a .get() default so that a variable which is
        # set but blank also falls back to 587 instead of crashing in int("")
        self.smtp_port = int(os.environ.get("EMAIL_SMTP_PORT") or 587)
        self.smtp_use_tls = is_bool(os.environ.get("EMAIL_SMTP_USE_TLS"))

        self.log_sent_emails = is_bool(os.environ.get("LOG_SENT_EMAILS"))

        self.templates = Jinja2Templates(
            directory=os.path.join(os.path.dirname(__file__), "email-templates")
        )

    def _send_encrypted(self, receiver: str, name: str, **kwargs) -> None:
        """Render the template `name` with `kwargs` and send it to `receiver`

        The rendered template is split on "~~~" into either three sections
        (subject, html, text), sent as multipart/alternative, or two sections
        (subject, text), sent as a plain-text message.

        The connection is only upgraded with STARTTLS when smtp_use_tls is set.
        If no SMTP server is configured, a notice is printed and nothing is sent.

        Raises HTTPException (500, "invalid_email_template") if the rendered
        template does not contain exactly two or three sections.
        """

        full = self.templates.env.get_template(name).render(kwargs)
        parts = full.split("~~~")
        if len(parts) == 3:
            subject, html, text = parts
        elif len(parts) == 2:
            subject, text = parts
            html = None
        else:
            raise HTTPException(status_code=500, detail="invalid_email_template")

        if self.log_sent_emails:
            print(full, flush=True)

        if not self.smtp_server:
            print(
                f'Email: created "{name}" msg for "{receiver}", but not sent (no SMTP server set)',
                flush=True,
            )
            return

        msg: Union[EmailMessage, MIMEMultipart]

        if html:
            # plain text first, html last: the last alternative is the
            # preferred one for clients that can display it
            msg = MIMEMultipart("alternative")
            msg.attach(MIMEText(text.strip(), "plain"))
            msg.attach(MIMEText(html.strip(), "html"))
        else:
            msg = EmailMessage()
            msg.set_content(text.strip())

        msg["Subject"] = subject.strip()
        # the From header carries the reply-to address; self.sender is only
        # used as the SMTP login name below
        msg["From"] = self.reply_to
        msg["To"] = receiver
        msg["Reply-To"] = msg["From"]

        with smtplib.SMTP(self.smtp_server, self.smtp_port) as server:
            if self.smtp_use_tls:
                server.ehlo()
                # the TLS context is only needed (and only built) for STARTTLS
                server.starttls(context=ssl.create_default_context())
            server.ehlo()
            if self.password:
                server.login(self.sender, self.password)
            server.send_message(msg)

    def send_user_validation(
        self, receiver_email: str, token: str, headers: Optional[dict] = None
    ):
        """Send email to validate registration email address"""

        origin = get_origin(headers)

        self._send_encrypted(receiver_email, "validate", origin=origin, token=token)

    # pylint: disable=too-many-arguments
    def send_user_invite(
        self,
        invite: InvitePending,
        token: UUID,
        org_name: str,
        is_new: bool,
        headers: Optional[dict] = None,
    ):
        """Send email to invite new user

        New users get a link to the join page, existing users a link to the
        invite-accept page. The inviter's email is passed to the template
        unless the invite comes from a superuser.
        """

        origin = get_origin(headers)

        receiver_email = invite.email or ""

        # percent-encode the address for use in the query string: a raw "+"
        # would be decoded as a space and "&" / "#" would truncate the value.
        # "@" is legal in a query component and is kept readable.
        email_param = quote(receiver_email, safe="@")

        invite_url = (
            f"{origin}/join/{token}?email={email_param}"
            if is_new
            else f"{origin}/invite/accept/{token}?email={email_param}"
        )

        self._send_encrypted(
            receiver_email,
            "invite",
            invite_url=invite_url,
            is_new=is_new,
            sender=invite.inviterEmail if not invite.fromSuperuser else "",
            org_name=org_name,
            support_email=self.support_email,
        )

    def send_user_forgot_password(self, receiver_email, token, headers=None):
        """Send password reset email with token"""
        origin = get_origin(headers)

        self._send_encrypted(
            receiver_email,
            "password_reset",
            origin=origin,
            token=token,
            support_email=self.support_email,
        )

    def send_background_job_failed(
        self,
        job: Union[CreateReplicaJob, DeleteReplicaJob],
        finished: datetime,
        receiver_email: str,
        org: Optional[Organization] = None,
    ):
        """Send background job failed email to superuser

        `org` is optional and is passed to the template as-is (possibly None).
        """
        self._send_encrypted(
            receiver_email, "failed_bg_job", job=job, org=org, finished=finished
        )

    def send_subscription_will_be_canceled(
        self,
        cancel_date: datetime,
        user_name: str,
        receiver_email: str,
        org: Organization,
        headers=None,
    ):
        """Send email indicating subscription is cancelled and all org data will be deleted"""

        origin = get_origin(headers)
        org_url = f"{origin}/orgs/{org.slug}/"

        self._send_encrypted(
            receiver_email,
            "sub_cancel",
            org_url=org_url,
            user_name=user_name,
            org_name=org.name,
            cancel_date=cancel_date,
            support_email=self.support_email,
            survey_url=self.survey_url,
        )
 |