"""
Updated: large tables are rendered with LongTable, and a DOCX export was added.
Processor module.
Exposes: generate_reports_from_csv(input_csv: str, out_dir: str) -> dict
Produces: out_dir/analysis_output.csv, out_dir/report.pdf, out_dir/report.docx (optional)
"""
|
|
import csv
import logging
import os
import re
import sys
import time
from datetime import datetime
from itertools import islice
from pathlib import Path
from xml.sax.saxutils import escape as xml_escape

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
from wordcloud import STOPWORDS, WordCloud

from reportlab.lib import colors
from reportlab.lib.enums import TA_LEFT
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.units import inch
from reportlab.platypus import (
    Image,
    LongTable,
    PageBreak,
    Paragraph,
    SimpleDocTemplate,
    Spacer,
    TableStyle,
)

# python-docx is optional: DOCX export is skipped when it cannot be imported.
DOCX_AVAILABLE = True
try:
    from docx import Document
    from docx.shared import Inches
except Exception:
    DOCX_AVAILABLE = False

# The project-local classifier is mandatory; fail loudly at import time.
try:
    import sentiment_analysis
except Exception as e:
    raise RuntimeError(f"Failed to import sentiment_analysis.py: {e}")
|
|
# Module logger. Handlers are left to the embedding application; only the
# level is set here.
logger = logging.getLogger("processor")
logger.setLevel(logging.INFO)

# CSV loading options.
CSV_ENCODING = "utf-8"
MAX_ROWS = None  # falsy => every row is processed; otherwise only the first MAX_ROWS
TOPIC_COUNT = 3  # upper bound on the number of LDA topics

# Maximum length of the post excerpt shown in report tables.
TEASER_CHAR_LIMIT = 900

# Matches "<n> <unit>[s] ago" (groups 1 and 2) or one of the bare keywords
# yesterday / today / just now / now (group 3).
RELATIVE_TIME_RE = re.compile(
    r'(?:(\d+)\s*(second|sec|s|minute|min|m|hour|hr|h|day|d|week|w|month|mo|year|yr|y)s?\s*ago)|\b(yesterday|today|just now|now)\b',
    flags=re.IGNORECASE,
)

# Device index in the transformers-pipeline convention: 0 = first CUDA GPU,
# -1 = CPU. Falls back to CPU when torch is missing or fails to import.
try:
    import torch
    device = 0 if torch.cuda.is_available() else -1
except Exception:
    device = -1
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# "<n> <unit>[s][.] ago"; the optional period covers abbreviated forms such
# as "2 hr. ago" or "1 mo. ago". Applied to already lower-cased text.
_AGO_RE = re.compile(
    r'(\d+)\s*(second|sec|s|minute|min|m|hour|hr|h|day|d|week|w|month|mo|year|yr|y)s?\.?\s*ago'
)

# Every accepted unit spelling -> (pd.Timedelta keyword, multiplier).
# Months and years are approximated as 30 and 365 days.
_TIME_UNITS = {
    name: (keyword, factor)
    for names, keyword, factor in (
        (("second", "sec", "s"), "seconds", 1),
        (("minute", "min", "m"), "minutes", 1),
        (("hour", "hr", "h"), "hours", 1),
        (("day", "d"), "days", 1),
        (("week", "w"), "weeks", 1),
        (("month", "mo"), "days", 30),
        (("year", "yr", "y"), "days", 365),
    )
    for name in names
}


def parse_relative_time(s: str, ref: pd.Timestamp):
    """Convert a relative time phrase into a timestamp measured from *ref*.

    Recognised inputs (case-insensitive, surrounding whitespace ignored):
    "now" / "just now" (-> ref), "today" (-> midnight of ref's date),
    "yesterday" (-> ref minus one day) and "<n> <unit> ago", where "a"/"an"
    count as 1 and the unit may be abbreviated, pluralised or followed by a
    period ("3 hours ago", "an hour ago", "2 hr. ago").

    Args:
        s: The phrase to parse.
        ref: Reference timestamp the offset is subtracted from.

    Returns:
        A pd.Timestamp, or pd.NaT when *s* is not a non-empty string or does
        not contain a recognised phrase.
    """
    if not isinstance(s, str) or s.strip() == "":
        return pd.NaT
    s = s.strip().lower()
    if s in ("just now", "now"):
        return ref
    if s == "today":
        return pd.Timestamp(ref.date())
    if s == "yesterday":
        return ref - pd.Timedelta(days=1)

    # "an hour ago" / "a day ago" -> "1 hour ago" / "1 day ago"
    s = re.sub(r'\b(an|a)\b', '1', s)
    m = _AGO_RE.search(s)
    if not m:
        return pd.NaT
    keyword, factor = _TIME_UNITS[m.group(2)]
    return ref - pd.Timedelta(**{keyword: int(m.group(1)) * factor})
|
|
# Compiled once at import; clean_text runs once per row.
_URL_RE = re.compile(r"http\S+")
_MENTION_RE = re.compile(r"@\w+")
_HASHTAG_RE = re.compile(r"#\w+")
_NON_LETTER_RE = re.compile(r"[^A-Za-z\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Normalise raw post text for classification and topic modelling.

    Removes URLs, @mentions and #hashtags, replaces every character that is
    not an ASCII letter or whitespace with a space, collapses whitespace and
    lower-cases the result.

    Returns:
        The cleaned string, or "" when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""
    for noise in (_URL_RE, _MENTION_RE, _HASHTAG_RE):
        text = noise.sub("", text)
    text = _NON_LETTER_RE.sub(" ", text)
    text = _WHITESPACE_RE.sub(" ", text)
    return text.lower().strip()
|
|
def chunked(iterable, size):
    """Yield consecutive pieces of *iterable* holding at most *size* items.

    Sized, sliceable inputs (list, tuple, str, ...) are sliced, so each piece
    has the input's own type. Inputs without ``len()`` (iterators,
    generators) are consumed lazily and yielded as lists.

    Raises:
        ValueError: if *size* is 0, or if *size* is below 1 for an input
            without ``len()``. The error surfaces on first iteration because
            this is a generator.
    """
    try:
        total = len(iterable)
    except TypeError:
        # No len(): batch lazily instead of slicing.
        if size < 1:
            raise ValueError("size must be at least 1")
        stream = iter(iterable)
        while True:
            batch = list(islice(stream, size))
            if not batch:
                return
            yield batch
    else:
        for start in range(0, total, size):
            yield iterable[start:start + size]
|
|
|
|
def teaser(s, n=TEASER_CHAR_LIMIT):
    """Return *s* stripped and shortened to at most *n* characters.

    Text that already fits is returned unchanged. Longer text is cut back to
    the last space inside the budget (a hard cut when there is none) and
    suffixed with " ...". The suffix counts towards *n*, so the result never
    exceeds the limit.

    Returns:
        The excerpt, or "" when *s* is not a string.
    """
    if not isinstance(s, str):
        return ""
    s = s.strip()
    if len(s) <= n:
        return s
    suffix = " ..."
    room = n - len(suffix)
    if room <= 0:
        # Limit too small to hold the suffix: plain hard cut.
        return s[:max(n, 0)]
    return s[:room].rsplit(" ", 1)[0] + suffix
|
|
# First (optionally negative) integer in the text, optionally followed by a
# decimal part and a k/m magnitude suffix that is not the start of a word
# ("1.2k" matches the suffix, "3 months" and "5km" do not).
_SCORE_RE = re.compile(r"(-?\d+)(?:(\.\d+)?([km])(?![a-z]))?", re.IGNORECASE)
_SCORE_MULTIPLIERS = {"k": 1_000, "m": 1_000_000}


def parse_score(x):
    """Extract an integer score from a raw cell value.

    Thousands separators are ignored ("1,234" -> 1234). Abbreviated counts
    are expanded ("1.2k" -> 1200, "2.5M" -> 2500000). Without a magnitude
    suffix only the integer part of the first number is used ("12.5" -> 12).

    Returns:
        An int, or np.nan when *x* is missing or contains no digits.
    """
    if pd.isna(x):
        return np.nan
    m = _SCORE_RE.search(str(x).replace(",", ""))
    if m is None:
        return np.nan
    whole, fraction, suffix = m.groups()
    if suffix is None:
        return int(whole)
    # round() absorbs binary floating-point noise before truncating to int.
    scaled = float(whole + (fraction or "")) * _SCORE_MULTIPLIERS[suffix.lower()]
    return int(round(scaled))
|
|
def parse_time_value(v, ref_ts):
    """Convert a raw "Time" cell into a timestamp.

    Timestamp/datetime values pass through ``pd.to_datetime``. Other values
    are stringified and tried first as a relative phrase ("2 hours ago",
    "yesterday", "now") anchored at *ref_ts*, then as an absolute date.
    The relative pass runs first because pandas would otherwise resolve
    "now" and "today" against the wall clock instead of *ref_ts*.

    Args:
        v: Raw cell value (timestamp, string, NaN, ...).
        ref_ts: Reference timestamp for relative phrases.

    Returns:
        A pd.Timestamp, or pd.NaT when the value is missing or unparseable.
    """
    if isinstance(v, (pd.Timestamp, datetime)):
        return pd.to_datetime(v)
    if pd.isna(v):
        return pd.NaT
    s = str(v).strip()

    relative = parse_relative_time(s, ref_ts)
    if pd.notna(relative):
        return pd.to_datetime(relative)

    try:
        parsed = pd.to_datetime(s, errors="coerce")
    except Exception:
        # Best effort: errors="coerce" already covers ordinary parse failures;
        # anything that still escapes is treated as "no date".
        return pd.NaT
    return parsed if pd.notna(parsed) else pd.NaT
|
|
def compile_list(lst):
    """Compile each regex source string in *lst* with re.IGNORECASE."""
    return [re.compile(pat, flags=re.IGNORECASE) for pat in lst]


# Keyword tables behind the "nature" label. Each list holds regex sources;
# the *_RE lists below hold the compiled, case-insensitive patterns.
# NOTE(review): determine_nature is fed clean_text() output, in which hyphens
# have become spaces, so entries containing a literal "-" (e.g.
# r"\bpro-india\b", r"\banti-corruption\b") cannot match on that path.
PRO_INDIA = [
    r"\bjai hind\b",
    r"\bvande mataram\b",
    r"\bpro india\b",
    r"\bpro-india\b",
    r"\bsupport (?:india|modi|bjp)\b",
    r"\bproud of india\b",
    r"\bindia is great\b",
]
ANTI_INDIA = [
    r"\banti[- ]?india\b",
    r"\banti national\b",
    r"\btraitor\b",
    r"\banti-india\b",
    r"\bkill india\b",
    r"\bboycott india\b",
]
CRITICAL_GOVT = [
    r"\bmodi sucks\b",
    r"\bcorrupt government\b",
    r"\bgovernment (?:is )?failing\b",
    r"\b(criticis|criticize|criticising) (?:government|modi|bjp)\b",
    r"\bpolicy (?:failure|fail)\b",
    r"\banti-corruption\b",
    r"\bmisgovern(ance|ing)\b",
    r"\bgovernment (?:policy|policies)",
]
SUPPORT_OPPOSITION = [
    r"\bsupport (?:congress|aam aadmi|aap|opposition)\b",
    r"\bvot(e|ing) for .*opposition\b",
]
SEPARATIST = [
    r"\bazadi\b",
    r"\bseparatist\b",
    r"\bsecede\b",
    r"\bindependence for\b",
]
COMMUNAL = [
    r"\bcommunal\b",
    r"\breligious (?:tension|hatred)\b",
    r"\breligious\b",
    r"\bminority\b",
]
CALL_TO_ACTION = [
    r"\bprotest\b",
    r"\bboycott\b",
    r"\bjoin (?:the )?protest\b",
    r"\bstrike\b",
    r"\brally\b",
    r"\baction\b",
]
CONSPIRACY = [
    r"\bforeign funded\b",
    r"\bdeep state\b",
    r"\bconspiracy\b",
    r"\bwestern plot\b",
    r"\bcia\b",
    r"\bsecret agenda\b",
]

PRO_INDIA_RE = compile_list(PRO_INDIA)
ANTI_INDIA_RE = compile_list(ANTI_INDIA)
CRITICAL_GOVT_RE = compile_list(CRITICAL_GOVT)
SUPPORT_OPPOSITION_RE = compile_list(SUPPORT_OPPOSITION)
SEPARATIST_RE = compile_list(SEPARATIST)
COMMUNAL_RE = compile_list(COMMUNAL)
CALL_TO_ACTION_RE = compile_list(CALL_TO_ACTION)
CONSPIRACY_RE = compile_list(CONSPIRACY)
|
|
|
|
def text_matches_any(text, patterns):
    """Return True when at least one compiled pattern is found in *text*.

    Args:
        text: String to search; any non-string value (None, NaN, ...) is
            treated as empty text instead of raising.
        patterns: Iterable of compiled regular expressions.
    """
    haystack = text if isinstance(text, str) else ""
    return any(pat.search(haystack) for pat in patterns)
|
|
# Classifier labels that map directly onto a nature value (compared
# case-insensitively; the nature is the lower-cased label itself).
_DIRECT_LABELS = frozenset(
    {"pro-india", "anti-india", "pro-government", "anti-government"}
)


def determine_nature(text, sentiment_label):
    """Derive the coarse "nature" category of a post.

    Precedence, first hit wins:
      1. High-priority keyword groups: separatist, call-to-action, communal,
         conspiratorial.
      2. The classifier label, when it is one of Pro-India / Anti-India /
         Pro-Government / Anti-Government (any letter case, consistent with
         is_dangerous, which also ignores case).
      3. Keyword fallbacks: anti-india, pro-india, critical-of-government,
         supportive-of-opposition.
      4. Generic polarity: a label containing "POS" -> "supportive",
         "NEG" -> "critical".
      5. Otherwise "neutral".

    Args:
        text: Post text; non-string values are treated as empty.
        sentiment_label: Label produced by the sentiment classifier.

    Returns:
        One of the category strings listed above.
    """
    t = text.lower() if isinstance(text, str) else ""

    priority_rules = (
        (SEPARATIST_RE, "separatist"),
        (CALL_TO_ACTION_RE, "call-to-action"),
        (COMMUNAL_RE, "communal"),
        (CONSPIRACY_RE, "conspiratorial"),
    )
    for patterns, nature in priority_rules:
        if text_matches_any(t, patterns):
            return nature

    label = str(sentiment_label)
    if label.lower() in _DIRECT_LABELS:
        return label.lower()

    fallback_rules = (
        (ANTI_INDIA_RE, "anti-india"),
        (PRO_INDIA_RE, "pro-india"),
        (CRITICAL_GOVT_RE, "critical-of-government"),
        (SUPPORT_OPPOSITION_RE, "supportive-of-opposition"),
    )
    for patterns, nature in fallback_rules:
        if text_matches_any(t, patterns):
            return nature

    label_upper = label.upper()
    if "POS" in label_upper:
        return "supportive"
    if "NEG" in label_upper:
        return "critical"
    return "neutral"
|
|
| |
# Whole-word keyword list and its compiled, case-insensitive alternation.
# NOTE(review): is_dangerous() does not consult `pattern`; confirm whether
# keyword matching is still meant to take part in flagging.
danger_keywords = [
    "kill", "attack", "bomb", "violence", "terror", "terrorist", "militant",
    "insurgency", "boycott", "protest", "call to action",
]
pattern = re.compile(
    r'\b(?:' + '|'.join(map(re.escape, danger_keywords)) + r')\b',
    flags=re.IGNORECASE,
)
|
|
def is_dangerous(text, sentiment):
    """Flag a post as potentially dangerous.

    A post is flagged when its sentiment label equals "ANTI-INDIA" (compared
    case-insensitively) and it has non-blank text. Missing or non-string
    text is never flagged.

    Returns:
        bool
    """
    if not isinstance(text, str) or not text.strip():
        return False
    return str(sentiment).upper() == "ANTI-INDIA"
|
|
def generate_reports_from_csv(input_csv: str, out_dir: str) -> dict:
    """Run the full analysis pipeline on a CSV of posts and write the reports.

    Steps: load and normalise the CSV, classify sentiment and "nature",
    assign LDA topics, flag dangerous posts, render charts, then write
    ``report.pdf``, ``analysis_output.csv`` and (when python-docx is
    installed) ``report.docx`` into *out_dir*, which is created if needed.

    Args:
        input_csv: Path of the source CSV. At least one of the columns
            "Title", "Comments", "Description" must be present; the other
            known columns ("Reference", "Subreddit", "Score", "Time",
            "Author", "Url") are optional.
        out_dir: Directory that receives every generated file.

    Returns:
        ``{"pdf": ..., "csv": ..., "docx": ...}`` with the written paths as
        strings. "csv" and "docx" are "" when that file could not be written
        (or DOCX export is unavailable).

    Raises:
        FileNotFoundError: *input_csv* does not exist.
        ValueError: the CSV cannot be read or has no text column.
    """
    logger.info("Running processing pipeline on %s", input_csv)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    df = _load_posts(input_csv)
    _classify_posts(df)
    topic_counts = _assign_topics(df)

    df["dangerous"] = pd.Series(
        [bool(is_dangerous(text, label))
         for text, label in zip(df["clean_text"], df["sentiment"])],
        index=df.index,
        dtype=bool,
    )
    dangerous_posts = df[df["dangerous"]].copy()
    logger.info("Flagged %d potentially dangerous posts.", len(dangerous_posts))

    sent_counts = df["sentiment"].value_counts()
    nature_counts = df["nature"].value_counts()
    charts = _render_charts(df, sent_counts, out_dir)

    pdf_out = out_dir / "report.pdf"
    _build_pdf(pdf_out, df, dangerous_posts, sent_counts, topic_counts,
               nature_counts, charts)

    csv_out = out_dir / "analysis_output.csv"
    csv_saved = _save_enriched_csv(df, csv_out)

    docx_out = out_dir / "report.docx"
    docx_saved = _build_docx(docx_out, df, sent_counts, nature_counts)

    logger.info("Processor: finished, files at %s", out_dir)
    return {
        "pdf": str(pdf_out),
        "csv": str(csv_out) if csv_saved else "",
        "docx": str(docx_out) if docx_saved else "",
    }


def _load_posts(input_csv):
    """Read the CSV and return the normalised working DataFrame."""
    if not os.path.exists(input_csv):
        raise FileNotFoundError(f"CSV file not found: {input_csv}")

    logger.info("Loading CSV: %s", input_csv)
    try:
        df_raw = pd.read_csv(input_csv, encoding=CSV_ENCODING, low_memory=False)
    except Exception as exc:
        raise ValueError(f"Error reading CSV {input_csv}: {exc}") from exc

    if MAX_ROWS:
        df_raw = df_raw.head(MAX_ROWS)

    if not any(c in df_raw.columns for c in ("Title", "Comments", "Description")):
        raise ValueError(
            f"No text column detected. CSV columns: {list(df_raw.columns)}"
        )

    def column(name, default):
        # Optional columns fall back to a constant instead of raising KeyError.
        if name in df_raw.columns:
            return df_raw[name]
        return pd.Series(default, index=df_raw.index, dtype=object)

    def text_column(name):
        return column(name, "").fillna("").astype(str)

    df = pd.DataFrame(index=df_raw.index)
    df["orig_index"] = df_raw.index.astype(str)
    df["title"] = text_column("Title")
    df["reference"] = column("Reference", "").astype(str)
    df["subreddit"] = column("Subreddit", "N/A")
    df["raw_score"] = column("Score", np.nan)
    df["comment"] = text_column("Comments")
    df["time_raw"] = column("Time", "")
    df["username"] = column("Author", "N/A")
    df["description"] = text_column("Description")
    df["url"] = column("Url", "")

    df["text_for_analysis"] = (
        df["title"] + " " + df["comment"] + " " + df["description"]
    ).str.strip()
    # Rows with no title/comment/description fall back to every other
    # non-blank string cell of the row.
    blank = df["text_for_analysis"] == ""
    if blank.any():
        df.loc[blank, "text_for_analysis"] = df.loc[blank].apply(
            lambda r: " ".join(
                str(v) for v in r.values if isinstance(v, str) and v.strip() != ""
            ),
            axis=1,
        )
    df["clean_text"] = df["text_for_analysis"].apply(clean_text)
    df["score"] = df["raw_score"].apply(parse_score)

    # Relative times ("2 hours ago") are anchored at the file's modification
    # time, falling back to the current time.
    try:
        ref_ts = pd.to_datetime(os.path.getmtime(input_csv), unit="s")
    except (OSError, ValueError, OverflowError):
        ref_ts = pd.Timestamp.now()
    df["created_at"] = df["time_raw"].apply(lambda x: parse_time_value(x, ref_ts))
    return df


def _classify_posts(df):
    """Add the "sentiment", "sentiment_score" and "nature" columns in place."""
    logger.info("Loading sentiment model...")
    sentiment_analysis.init_anchors()

    labels, scores = [], []
    for text in df["clean_text"]:
        out = sentiment_analysis.classify(text)
        if "error" in out:
            # A failed classification degrades to a neutral, zero-confidence row.
            labels.append("NEUTRAL")
            scores.append(0.0)
        else:
            labels.append(out.get("label", "NEUTRAL"))
            scores.append(float(out.get("confidence", 0.0)))

    df["sentiment"] = labels
    df["sentiment_score"] = scores
    df["nature"] = [
        determine_nature(text, label)
        for text, label in zip(df["clean_text"], labels)
    ]


def _assign_topics(df):
    """Add the "topic" column in place and return the posts-per-topic counts."""
    logger.info("Performing topic modeling...")
    vectorizer = CountVectorizer(stop_words="english", min_df=2)
    try:
        X = vectorizer.fit_transform(df["clean_text"])
    except Exception as exc:
        logger.warning("Topic vectorization failed: %s", exc)
        X = None

    # Too few documents or terms: skip topic modelling altogether.
    if X is None or X.shape[0] < 3 or len(vectorizer.get_feature_names_out()) < 5:
        df["topic"] = np.nan
        return pd.Series(dtype=int)

    n_topics = min(TOPIC_COUNT, X.shape[0])
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(X)
    df["topic"] = lda.transform(X).argmax(axis=1)
    return df["topic"].value_counts().sort_index()


def _save_chart(target, figsize, draw):
    """Draw on a fresh figure and save it; return *target*, or None on failure."""
    fig = None
    try:
        fig, ax = plt.subplots(figsize=figsize)
        draw(ax)
        fig.tight_layout()
        fig.savefig(target, dpi=150)
        return target
    except Exception as exc:
        # Charts are optional; the reports are still produced without them.
        logger.warning("Visuals generation failed: %s", exc)
        return None
    finally:
        if fig is not None:
            plt.close(fig)


def _render_charts(df, sent_counts, out_dir):
    """Write the PNG charts into *out_dir*; map chart name -> path or None."""
    charts = {"sentiment": None, "topics": None, "danger_wc": None}

    def draw_sentiment(ax):
        sent_counts.plot(kind="bar", ax=ax)
        ax.set_title("Sentiment Distribution")

    charts["sentiment"] = _save_chart(out_dir / "sentiment.png", (6, 4), draw_sentiment)

    if "topic" in df and df["topic"].notna().any():
        per_topic = df["topic"].value_counts().sort_index()

        def draw_topics(ax):
            per_topic.plot(kind="bar", ax=ax)
            ax.set_title("Topic Distribution")

        charts["topics"] = _save_chart(out_dir / "topics.png", (6, 4), draw_topics)

    flagged = df[df["dangerous"]]
    if not flagged.empty:
        wc_text = " ".join(flagged["clean_text"].tolist())

        def draw_cloud(ax):
            cloud = WordCloud(
                width=1000, height=400, background_color="white",
                stopwords=set(STOPWORDS),
            ).generate(wc_text)
            ax.imshow(cloud, interpolation="bilinear")
            ax.axis("off")

        charts["danger_wc"] = _save_chart(out_dir / "danger_wc.png", (12, 5), draw_cloud)

    return charts


def _summary_lines(counts, total):
    """Yield one "<label>: <n> posts (<pct>%)" line per entry of *counts*."""
    for label, count in counts.items():
        pct = count / total * 100 if total > 0 else 0
        yield f"{label}: {count} posts ({pct:.1f}%)"


def _cell_text(value):
    """Render a table cell, showing "N/A" for missing values."""
    return "N/A" if pd.isna(value) else str(value)


def _date_text(value):
    """Format a timestamp for the reports, showing "N/A" for NaT."""
    return value.strftime("%Y-%m-%d %H:%M") if pd.notna(value) else "N/A"


def _score_text(value):
    """Render a score without the ".0" that NaN-induced float columns add."""
    if pd.isna(value):
        return "N/A"
    try:
        return str(int(value))
    except (TypeError, ValueError, OverflowError):
        return str(value)


def _long_table(rows, col_widths, usable_width, extra_commands=()):
    """Build a styled LongTable whose columns are scaled to fit the frame."""
    total_width = sum(col_widths)
    if total_width > usable_width:
        factor = usable_width / total_width
        col_widths = [w * factor for w in col_widths]
    table = LongTable(rows, colWidths=col_widths, repeatRows=1)
    table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor("#4F81BD")),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('GRID', (0, 0), (-1, -1), 0.25, colors.grey),
        ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('LEFTPADDING', (0, 0), (-1, -1), 4),
        ('RIGHTPADDING', (0, 0), (-1, -1), 4),
        *extra_commands,
    ]))
    return table


def _build_pdf(pdf_out, df, dangerous_posts, sent_counts, topic_counts,
               nature_counts, charts):
    """Assemble and write the PDF report."""
    logger.info("Building PDF report (LongTable for large tables)...")
    styles = getSampleStyleSheet()
    normal = styles["Normal"]
    heading = styles["Heading2"]
    post_style = ParagraphStyle(
        "TweetStyle", parent=styles["BodyText"], fontSize=9, leading=11,
        spaceAfter=6, alignment=TA_LEFT,
    )

    # Single template: page size and margins apply to the built document.
    doc = SimpleDocTemplate(
        str(pdf_out), pagesize=A4,
        rightMargin=36, leftMargin=36, topMargin=36, bottomMargin=36,
    )
    # The default frame pads 6pt on each side of the printable width.
    usable_width = doc.width - 12

    def excerpt(text):
        # Paragraph parses XML-like markup, so user text must be escaped.
        return Paragraph(xml_escape(teaser(text, TEASER_CHAR_LIMIT)), post_style)

    def chart(name, height):
        return Image(str(charts[name]), width=5.5 * inch, height=height)

    total = len(df)
    elements = [
        Paragraph("Reddit Posts Report (CSV Source) — India-specific Nature", styles["Title"]),
        Spacer(1, 8),
        Paragraph(f"Total Posts Processed: {total}", normal),
        Spacer(1, 8),
    ]

    elements.append(Paragraph("Sentiment Analysis Summary", heading))
    for line in _summary_lines(sent_counts, total):
        elements.append(Paragraph(xml_escape(line), normal))
    elements.append(Spacer(1, 6))
    if charts["sentiment"] is not None:
        elements.append(chart("sentiment", 3 * inch))
    elements.append(Spacer(1, 12))

    if not topic_counts.empty:
        elements.append(Paragraph("Topic Modeling Summary", heading))
        for idx, val in topic_counts.items():
            elements.append(Paragraph(f"Topic {int(idx)}: {int(val)} posts", normal))
        elements.append(Spacer(1, 6))
        if charts["topics"] is not None:
            elements.append(chart("topics", 3 * inch))
        elements.append(Spacer(1, 12))

    elements.append(Paragraph("Nature (India-specific) Summary", heading))
    for line in _summary_lines(nature_counts, total):
        elements.append(Paragraph(xml_escape(line), normal))
    elements.append(Spacer(1, 12))

    elements.append(Paragraph("Flagged Potentially Dangerous Posts", heading))
    elements.append(Spacer(1, 6))
    if dangerous_posts.empty:
        elements.append(Paragraph("No dangerous posts detected.", normal))
    else:
        flagged_rows = [["Post (teaser)", "Subreddit", "Author", "Sentiment",
                         "Nature", "Topic", "Date"]]
        for row in dangerous_posts.itertuples(index=False):
            flagged_rows.append([
                excerpt(row.text_for_analysis),
                _cell_text(row.subreddit),
                _cell_text(row.username),
                str(row.sentiment),
                str(row.nature),
                "N/A" if pd.isna(row.topic) else str(int(row.topic)),
                _date_text(row.created_at),
            ])
        flagged_widths = [3.0 * inch, 0.7 * inch, 0.8 * inch, 0.6 * inch,
                          0.8 * inch, 0.5 * inch, 1.0 * inch]
        elements.append(_long_table(
            flagged_rows, flagged_widths, usable_width,
            extra_commands=(
                ('ALIGN', (1, 0), (-1, -1), 'CENTER'),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ),
        ))
        elements.append(Spacer(1, 12))
        if charts["danger_wc"] is not None:
            elements.append(Paragraph("Word Cloud of Flagged Posts", heading))
            elements.append(chart("danger_wc", 2.6 * inch))

    elements.append(PageBreak())

    elements.append(Paragraph("All Collected Posts", heading))
    all_rows = [["Date", "Subreddit", "Author", "Score", "Nature", "Post (teaser)"]]
    for row in df.itertuples(index=False):
        all_rows.append([
            _date_text(row.created_at),
            _cell_text(row.subreddit),
            _cell_text(row.username),
            _score_text(row.score),
            str(row.nature),
            excerpt(row.text_for_analysis),
        ])
    all_widths = [1.0 * inch, 1.0 * inch, 1.0 * inch, 0.7 * inch, 0.9 * inch, 2.8 * inch]
    elements.append(_long_table(all_rows, all_widths, usable_width))

    doc.build(elements)
    logger.info("PDF saved as: %s", pdf_out)


def _save_enriched_csv(df, csv_out, attempts=3):
    """Write the enriched CSV, retrying on PermissionError; True on success."""
    df_out = df.copy()
    df_out["created_at_str"] = df_out["created_at"].apply(
        lambda x: x.strftime("%Y-%m-%d %H:%M:%S") if pd.notna(x) else ""
    )
    for attempt in range(1, attempts + 1):
        try:
            df_out.to_csv(csv_out, index=False, encoding="utf-8")
        except PermissionError:
            if attempt < attempts:
                logger.warning(
                    "Permission denied saving CSV (file locked?). Retrying %d/%d in 1s...",
                    attempt, attempts,
                )
                time.sleep(1)
            else:
                logger.error(
                    "FAILED to save CSV. The file is likely open in another "
                    "program (Excel/VS Code)."
                )
        else:
            logger.info("Enriched CSV saved as: %s", csv_out)
            return True
    return False


def _build_docx(docx_out, df, sent_counts, nature_counts):
    """Write the DOCX report; True on success, False when skipped or failed."""
    if not DOCX_AVAILABLE:
        logger.info(
            "python-docx not installed — skipping DOCX export. "
            "Install via: pip install python-docx"
        )
        return False

    try:
        logger.info("Building DOCX report...")
        total = len(df)
        document = Document()
        document.add_heading("Reddit Posts Report (India-specific Nature)", level=1)
        document.add_paragraph(f"Total Posts Processed: {total}")

        document.add_heading("Sentiment Analysis Summary", level=2)
        for line in _summary_lines(sent_counts, total):
            document.add_paragraph(line)

        document.add_heading("Nature Summary", level=2)
        for line in _summary_lines(nature_counts, total):
            document.add_paragraph(line)

        sample_n = min(200, total)
        document.add_heading(f"Sample of First {sample_n} Posts", level=2)
        table = document.add_table(rows=1, cols=6)
        headers = ("Date", "Subreddit", "Author", "Score", "Nature", "Post (teaser)")
        for cell, title in zip(table.rows[0].cells, headers):
            cell.text = title
        for row in df.head(sample_n).itertuples(index=False):
            values = (
                _date_text(row.created_at),
                _cell_text(row.subreddit),
                _cell_text(row.username),
                _score_text(row.score),
                str(row.nature),
                teaser(row.text_for_analysis, 300),
            )
            for cell, value in zip(table.add_row().cells, values):
                cell.text = value

        document.save(str(docx_out))
        logger.info("DOCX saved as: %s", docx_out)
        return True
    except Exception as exc:
        logger.exception("DOCX creation failed: %s", exc)
        # Do not leave a partially written report behind.
        try:
            docx_out.unlink(missing_ok=True)
        except OSError:
            pass
        return False
| |
|
|