Commit b098f6c0 authored by Andrey Filippov

Improved email scripts for Codex to use

parent e589ddf5
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
import email import email
import imaplib import imaplib
import os import os
import re
from email.header import decode_header from email.header import decode_header
from pathlib import Path from pathlib import Path
...@@ -38,6 +39,46 @@ def _write_index(index_path, values): ...@@ -38,6 +39,46 @@ def _write_index(index_path, values):
f.write(line + "\n") f.write(line + "\n")
def _decoded_text_parts(msg):
    """Return the decoded text content of an email message as one string.

    For a multipart message, every ``text/plain`` part yielded by
    ``msg.walk()`` is decoded. For a single-part message the payload is
    decoded regardless of its content type. Parts whose decoded payload is
    ``None`` are skipped, and the remaining texts are joined with newlines.

    Args:
        msg: A parsed message object exposing ``is_multipart()``, ``walk()``,
            ``get_content_type()``, ``get_payload()`` and
            ``get_content_charset()`` (e.g. ``email.message.Message``).

    Returns:
        The decoded text, or an empty string when nothing could be decoded.
    """

    def _decode_part(part):
        """Decode one part's payload to ``str``; return ``None`` if it has none."""
        payload = part.get_payload(decode=True)
        if payload is None:
            return None
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="replace")
        except LookupError:
            # The message declared a charset label Python has no codec for
            # (e.g. "unknown-8bit"). Fall back to UTF-8 with replacement
            # characters instead of aborting the whole fetch.
            return payload.decode("utf-8", errors="replace")

    if msg.is_multipart():
        candidates = (
            part for part in msg.walk()
            if part.get_content_type() == "text/plain"
        )
    else:
        candidates = (msg,)

    decoded = (_decode_part(part) for part in candidates)
    return "\n".join(text for text in decoded if text is not None)
def _extract_original_headers(text):
    """Extract From/Subject/Date of the innermost forwarded message in *text*.

    The text is searched for blocks that start with ``Forwarded message``
    followed by ``From:``, an optional ``To:``, ``Date:`` and ``Subject:``
    lines (case-insensitive). When several blocks are found, the last one in
    the text is used. Runs of whitespace inside each value are collapsed to
    single spaces.

    Args:
        text: Decoded message body; may be empty or ``None``.

    Returns:
        A tuple ``(original_from, original_subject, original_date)``. All
        three are empty strings when *text* is falsy or no block matches.
    """
    if not text:
        return "", "", ""

    # Find forwarded blocks and keep the deepest match.
    # The Subject value may be terminated by a newline OR by the end of the
    # text, so a forwarded header block that ends the body without a
    # trailing newline is still recognized.
    pattern = re.compile(
        r"Forwarded message.*?From:\s*(?P<from>.+?)\r?\n"
        r"(?:To:\s*.+?\r?\n)?"
        r"Date:\s*(?P<date>.+?)\r?\n"
        r"Subject:\s*(?P<subject>.+?)(?:\r?\n|\Z)",
        re.IGNORECASE | re.DOTALL,
    )
    matches = list(pattern.finditer(text))
    if not matches:
        return "", "", ""

    last = matches[-1]
    original_from, original_date, original_subject = (
        " ".join(last.group(name).split())
        for name in ("from", "date", "subject")
    )
    # NOTE: return order is (from, subject, date), not the order the headers
    # appear in the text.
    return original_from, original_subject, original_date
def fetch(): def fetch():
if not IMAP_PASS: if not IMAP_PASS:
raise SystemExit("Set ELPHEL_IMAP_PASS") raise SystemExit("Set ELPHEL_IMAP_PASS")
...@@ -59,6 +100,8 @@ def fetch(): ...@@ -59,6 +100,8 @@ def fetch():
from_ = _decode(msg.get("From")) from_ = _decode(msg.get("From"))
to_ = _decode(msg.get("To")) to_ = _decode(msg.get("To"))
cc_ = _decode(msg.get("Cc")) cc_ = _decode(msg.get("Cc"))
text = _decoded_text_parts(msg)
original_from, original_subject, original_date = _extract_original_headers(text)
date_dir = OUT_DIR / (date[:10].replace(" ", "_") if date else "unknown-date") date_dir = OUT_DIR / (date[:10].replace(" ", "_") if date else "unknown-date")
date_dir.mkdir(parents=True, exist_ok=True) date_dir.mkdir(parents=True, exist_ok=True)
seq = f"{idx:04d}" seq = f"{idx:04d}"
...@@ -67,10 +110,20 @@ def fetch(): ...@@ -67,10 +110,20 @@ def fetch():
out_path.write_bytes(raw) out_path.write_bytes(raw)
meta_path = date_dir / f"{seq}_meta.txt" meta_path = date_dir / f"{seq}_meta.txt"
meta_path.write_text( meta_path.write_text(
f"From: {from_}\nTo: {to_}\nCc: {cc_}\nDate: {date}\nSubject: {subject}\n", f"From: {from_}\n"
f"To: {to_}\n"
f"Cc: {cc_}\n"
f"Date: {date}\n"
f"Subject: {subject}\n"
f"Original-From: {original_from}\n"
f"Original-Date: {original_date}\n"
f"Original-Subject: {original_subject}\n",
encoding="utf-8", encoding="utf-8",
) )
_write_index(OUT_DIR / "index.csv", [date, from_, to_, cc_, subject, str(out_path)]) _write_index(
OUT_DIR / "index.csv",
[date, from_, to_, cc_, subject, original_from, original_date, original_subject, str(out_path)],
)
if __name__ == "__main__": if __name__ == "__main__":
......
File mode changed from 100644 to 100755
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment