Happy People committed on
Commit
11d88a8
·
1 Parent(s): 30ae158

Deploying CPU-optimized Dolphin Worker from Visique

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Dockerfile +27 -13
  2. app/__init__.py +98 -0
  3. app/api/admin.py +606 -0
  4. app/api/auth.py +273 -0
  5. app/api/endpoints.py +830 -0
  6. app/api/visilok.py +67 -0
  7. app/core/__init__.py +52 -0
  8. app/core/config.py +71 -0
  9. app/core/database.py +46 -0
  10. app/core/feature_registry.py +266 -0
  11. app/core/migrations.py +111 -0
  12. app/core/plan_config.py +192 -0
  13. app/core/security.py +28 -0
  14. app/core/stripe_config.py +29 -0
  15. app/main.py +127 -0
  16. app/models/feature_flags.py +59 -0
  17. app/models/user.py +63 -0
  18. app/schemas/chat.py +14 -0
  19. app/schemas/financial.py +47 -0
  20. app/schemas/user.py +82 -0
  21. app/services/__init__.py +36 -0
  22. app/services/analysis/__init__.py +54 -0
  23. app/services/analysis/engine_lite.py +48 -0
  24. app/services/analysis/factory.py +18 -0
  25. app/services/analysis/fundamental.py +75 -0
  26. app/services/analysis/growth.py +26 -0
  27. app/services/analysis/health_score.py +46 -0
  28. app/services/analysis/kpi.py +95 -0
  29. app/services/analysis/registry.py +65 -0
  30. app/services/analysis/risk.py +57 -0
  31. app/services/analysis/simulation.py +67 -0
  32. app/services/feature_service.py +306 -0
  33. app/services/ingestion/__init__.py +57 -0
  34. app/services/ingestion/doc_keywords.py +1408 -0
  35. app/services/ingestion/dolphin/__init__.py +158 -0
  36. app/services/ingestion/dolphin/classifier.py +278 -0
  37. app/services/ingestion/dolphin/client.py +393 -0
  38. app/services/ingestion/dolphin/extractor.py +336 -0
  39. app/services/ingestion/dolphin/remote_client.py +110 -0
  40. app/services/ingestion/keyword_learner.py +262 -0
  41. app/services/ingestion/learned_keywords.json +1 -0
  42. app/services/ingestion/mappings.py +389 -0
  43. app/services/ingestion/parser_csv.py +139 -0
  44. app/services/ingestion/parser_dolphin.py +471 -0
  45. app/services/ingestion/parser_pdf.py +213 -0
  46. app/services/ingestion/parser_xlsx.py +312 -0
  47. app/services/ingestion/unified_parser.py +84 -0
  48. app/services/intelligence/ai_cfo.py +52 -0
  49. app/services/intelligence/gemini_service.py +238 -0
  50. app/services/intelligence/geo_service.py +104 -0
Dockerfile CHANGED
@@ -1,23 +1,37 @@
1
- FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
2
 
3
- # System deps
4
- RUN apt-get update && apt-get install -y --no-install-recommends \
5
- python3 python3-pip poppler-utils git \
 
 
 
 
 
 
 
 
 
 
6
  && rm -rf /var/lib/apt/lists/*
7
 
8
- # Create non-root user (required by HF Spaces)
9
  RUN useradd -m -u 1000 user
10
  USER user
11
- ENV HOME=/home/user PATH="/home/user/.local/bin:$PATH"
12
-
13
  WORKDIR /home/user/app
14
 
15
- # Install Python deps first (layer caching)
16
- COPY --chown=user requirements.txt .
17
- RUN pip install --no-cache-dir --user -r requirements.txt
18
-
19
  # Copy application code
20
- COPY --chown=user . .
21
 
 
 
 
 
 
 
 
 
22
  EXPOSE 7860
23
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
1
FROM python:3.10-slim

# Set environment
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV PATH="/home/user/.local/bin:$PATH"

# HF Spaces Free Tier: Force CPU-only mode
ENV DOLPHIN_DEVICE=cpu
ENV DOLPHIN_MAX_BATCH_SIZE=1

# Install system dependencies (poppler for pdf2image).
# --no-install-recommends keeps the image slim.
RUN apt-get update && apt-get install -y --no-install-recommends \
    poppler-utils \
    git \
    && rm -rf /var/lib/apt/lists/*

# Create user (Hugging Face Spaces runs as user 1000)
RUN useradd -m -u 1000 user
USER user

WORKDIR /home/user/app

# FIX: copy requirements.txt and install dependencies BEFORE copying the
# full source tree, so the (slow) pip layer is cached across code-only
# rebuilds instead of being invalidated by every source change.
COPY --chown=user:user requirements.txt .
RUN pip3 install --no-cache-dir --upgrade pip && \
    pip3 install --no-cache-dir -r requirements.txt

# Copy application code
COPY --chown=user:user . .

# Create models directory for cached model weights
RUN mkdir -p /home/user/app/models/dolphin-v2

# Expose port (HF Spaces defaults to 7860)
EXPOSE 7860

# Start the worker directly (no ngrok, no bash wrapper)
CMD ["python3", "-m", "uvicorn", "worker:app", "--host", "0.0.0.0", "--port", "7860"]
app/__init__.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Visique Backend Application
3
+
4
+ This package contains the backend API and services for the Visique financial analysis platform.
5
+
6
+ ## Architecture Overview
7
+
8
+ ```
9
+ app/
10
+ ├── api/ # FastAPI route handlers
11
+ │ ├── admin.py # Admin console endpoints (users, reports, features)
12
+ │ ├── auth.py # Authentication (login, register, JWT)
13
+ │ └── endpoints.py # Analysis endpoints (upload, simulate, report)
14
+
15
+ ├── core/ # Core configuration and utilities
16
+ │ ├── config.py # Environment settings (API keys, URLs)
17
+ │ ├── database.py # SQLAlchemy database connection
18
+ │ ├── security.py # JWT token creation/validation
19
+ │ ├── feature_registry.py # Auto-discoverable feature definitions
20
+ │ └── plan_config.py # Plan limits and default features
21
+
22
+ ├── models/ # SQLAlchemy database models
23
+ │ ├── user.py # User, Analysis, Payment models
24
+ │ └── feature_flags.py # PlanFeatureOverride, PlanUploadLimit
25
+
26
+ ├── schemas/ # Pydantic request/response schemas
27
+ │ ├── user.py # UserCreate, UserResponse, etc.
28
+ │ ├── financial.py # StandardizedDataPackage, KPIs, etc.
29
+ │ └── chat.py # ChatRequest, ChatResponse
30
+
31
+ ├── services/ # Business logic layer
32
+ │ ├── feature_service.py # Feature flag resolution logic
33
+ │ ├── analysis/ # Financial analysis modules
34
+ │ │ ├── fundamental.py # Main analysis orchestrator
35
+ │ │ ├── kpi.py # KPI calculations
36
+ │ │ ├── risk.py # Risk analysis
37
+ │ │ ├── health_score.py # Health score computation
38
+ │ │ ├── growth.py # Growth metrics
39
+ │ │ └── simulation.py # What-if scenario modeling
40
+ │ ├── ingestion/ # Data parsing
41
+ │ │ ├── parser_csv.py # CSV file parsing
42
+ │ │ ├── parser_pdf.py # PDF extraction + OCR
43
+ │ │ └── mappings.py # Field name normalization
44
+ │ ├── intelligence/ # AI-powered features
45
+ │ │ ├── gemini_service.py # Gemini API integration
46
+ │ │ ├── ai_cfo.py # AI CFO chat functionality
47
+ │ │ ├── geo_service.py # Geo-strategic analysis
48
+ │ │ └── rag.py # RAG for document QA
49
+ │ └── reporting/ # Report generation
50
+ │ ├── pdf_report.py # PDF report builder
51
+ │ └── pptx_report.py # PowerPoint builder
52
+
53
+ └── main.py # FastAPI app initialization
54
+ ```
55
+
56
+ ## Module Responsibilities
57
+
58
+ ### API Layer (`api/`)
59
+ - HTTP request handling only
60
+ - Input validation via Pydantic
61
+ - Delegates all logic to services
62
+ - Returns standardized responses
63
+
64
+ ### Core Layer (`core/`)
65
+ - Application-wide configuration
66
+ - Feature registry (add new features here)
67
+ - Plan configuration (modify limits here)
68
+ - Security utilities (JWT)
69
+
70
+ ### Models Layer (`models/`)
71
+ - Database schema definitions
72
+ - Relationships between entities
73
+ - No business logic
74
+
75
+ ### Schemas Layer (`schemas/`)
76
+ - Request/response validation
77
+ - Data transformation for API
78
+ - Type hints for IDE support
79
+
80
+ ### Services Layer (`services/`)
81
+ - All business logic lives here
82
+ - Each subdirectory is a domain
83
+ - Services are stateless and testable
84
+
85
+ ## Adding New Features
86
+
87
+ 1. **New Feature Flag**: Add to `core/feature_registry.py`
88
+ 2. **New API Endpoint**: Add to appropriate `api/*.py`
89
+ 3. **New Service Logic**: Create in `services/` subdirectory
90
+ 4. **New Model Field**: Add to `models/` and run migration
91
+
92
+ ## Key Design Patterns
93
+
94
+ - **Repository Pattern**: Services interact with DB via session
95
+ - **Dependency Injection**: FastAPI `Depends()` for DB/auth
96
+ - **Single Responsibility**: Each module has one clear purpose
97
+ - **Feature Registry**: Auto-discoverable, category-organized
98
+ """
app/api/admin.py ADDED
@@ -0,0 +1,606 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Admin console API routes: user/payment management, feature flags, usage."""
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy.orm import Session
from typing import List, Optional
from app.core.database import get_db
from app.models.user import User, Payment, Analysis
from app.schemas.user import UserResponse, PaymentResponse
from app.api.auth import get_current_user
import os

router = APIRouter(prefix="/admin", tags=["admin"])


def get_current_admin(current_user: User = Depends(get_current_user)):
    """Dependency: resolve the authenticated user and require admin rights.

    Raises:
        HTTPException: 403 if the authenticated user is not an admin.
    """
    if current_user.is_admin:
        return current_user
    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="The user doesn't have enough privileges",
    )
19
+
20
@router.get("/payments", response_model=List[PaymentResponse])
def read_all_payments(
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """Return a paginated list of all payment records (admin only)."""
    return db.query(Payment).offset(skip).limit(limit).all()
29
+
30
@router.delete("/users/{user_id}", status_code=status.HTTP_204_NO_CONTENT)
def delete_user(
    user_id: int,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """Delete a user by id; an admin cannot delete their own account.

    Raises:
        HTTPException: 404 when the user does not exist,
            400 on self-deletion attempts.
    """
    target = db.query(User).filter(User.id == user_id).first()
    if target is None:
        raise HTTPException(status_code=404, detail="User not found")

    # Guard: deleting your own admin account would lock you out mid-session.
    if target.id == current_user.id:
        raise HTTPException(status_code=400, detail="Cannot delete your own admin account")

    db.delete(target)
    db.commit()
    return None
46
+
47
from pydantic import BaseModel


class AdminUserUpdate(BaseModel):
    """Partial-update payload for admin edits of a user profile.

    Every field is optional; only fields explicitly set by the caller are
    applied (the route uses ``dict(exclude_unset=True)``).
    """
    full_name: Optional[str] = None
    company_name: Optional[str] = None
    plan: Optional[str] = None
    is_admin: Optional[bool] = None
    is_super_admin: Optional[bool] = None  # only super admins may change this
    visique_id: Optional[str] = None
    ein: Optional[str] = None
    address: Optional[str] = None
    industry: Optional[str] = None


class FeatureToggleRequest(BaseModel):
    """Bulk feature toggle payload."""
    feature_states: dict  # {feature_id: bool}
61
+
62
@router.put("/users/{user_id}", response_model=UserResponse)
def update_user_admin(
    user_id: int,
    user_update: AdminUserUpdate,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """Update a user's profile fields (admin only).

    Admin profiles may only be edited by super admins, and only super
    admins may grant or revoke the ``is_super_admin`` flag.
    """
    user = db.query(User).filter(User.id == user_id).first()
    if user is None:
        raise HTTPException(status_code=404, detail="User not found")

    # Check if target is admin and requester is not super admin
    if user.is_admin and not current_user.is_super_admin:
        raise HTTPException(
            status_code=403,
            detail="Only Special Admins can edit Admin profiles"
        )

    for field, value in user_update.dict(exclude_unset=True).items():
        # The super-admin flag is silently skipped for regular admins.
        if field == "is_super_admin" and not current_user.is_super_admin:
            continue
        setattr(user, field, value)

    db.commit()
    db.refresh(user)
    return user
90
+
91
+
92
@router.put("/users/{user_id}/features")
def update_user_features(
    user_id: int,
    request: FeatureToggleRequest,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Update custom feature overrides for a specific user.

    Merges ``request.feature_states`` into the user's existing overrides;
    keys not present in the request are preserved.
    """
    user = db.query(User).filter(User.id == user_id).first()
    if not user:
        raise HTTPException(status_code=404, detail="User not found")

    current_features = user.custom_features or {}

    # SQLite may hand the JSON column back as a raw string; parse it.
    if isinstance(current_features, str):
        import json
        try:
            current_features = json.loads(current_features)
        # FIX: was a bare `except:`, which swallows everything (including
        # KeyboardInterrupt/SystemExit). Only decode/type failures mean
        # "treat as empty".
        except (ValueError, TypeError):
            current_features = {}

    # Copy into a fresh dict so the ORM sees a new object, then merge.
    new_features = dict(current_features)
    new_features.update(request.feature_states)
    user.custom_features = new_features

    # JSON columns don't track in-place mutation; explicitly mark dirty.
    from sqlalchemy.orm.attributes import flag_modified
    flag_modified(user, "custom_features")

    db.commit()
    return {
        "status": "success",
        "user_id": user.id,
        "custom_features": user.custom_features
    }
134
+
135
+
136
class EngineUpdateRequest(BaseModel):
    engine: str  # accepted values: "v1" or "v2"


@router.put("/users/{user_id}/engine")
def update_user_engine(
    user_id: int,
    request: EngineUpdateRequest,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Update a user's preferred engine (v1 or v2).
    """
    user = db.query(User).filter(User.id == user_id).first()
    if user is None:
        raise HTTPException(status_code=404, detail="User not found")

    if request.engine not in ("v1", "v2"):
        raise HTTPException(status_code=400, detail="Invalid engine. Must be 'v1' or 'v2'")

    user.preferred_engine = request.engine
    db.commit()
    db.refresh(user)

    return {"status": "success", "user_id": user.id, "preferred_engine": user.preferred_engine}
161
+
162
@router.get("/users", response_model=List[UserResponse])
def read_all_users(
    skip: int = 0,
    limit: int = 100,
    search: Optional[str] = None,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """List users, optionally filtered by email, name, or Visique ID."""
    query = db.query(User)
    if search:
        # Case-insensitive partial match across the three identity fields.
        pattern = f"%{search}%"
        query = query.filter(
            User.email.ilike(pattern)
            | User.full_name.ilike(pattern)
            | User.visique_id.ilike(pattern)
        )
    return query.offset(skip).limit(limit).all()
180
+
181
@router.get("/analyses")
def read_all_analyses(
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Get all analyses from all users.
    Returns a simplified list for the admin dashboard.
    """
    # Join with User to get owner details; newest analyses first.
    records = (
        db.query(Analysis)
        .join(User)
        .order_by(Analysis.timestamp.desc())
        .offset(skip)
        .limit(limit)
        .all()
    )
    return [
        {
            "id": a.id,
            "company_name": a.company_name,
            "filename": a.input_filename,
            "timestamp": a.timestamp,
            "owner_email": a.owner.email,
            "owner_visique_id": a.owner.visique_id,
        }
        for a in records
    ]
206
+
207
@router.delete("/analyses/{analysis_id}", status_code=status.HTTP_204_NO_CONTENT)
def delete_analysis_admin(
    analysis_id: int,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """Delete an analysis record and best-effort remove its stored file."""
    analysis = db.query(Analysis).filter(Analysis.id == analysis_id).first()
    if analysis is None:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # Remove the uploaded file from disk; a failed unlink must not block
    # deletion of the database row (deliberate best-effort cleanup).
    stored = analysis.stored_filename
    if stored and os.path.exists(stored):
        try:
            os.remove(stored)
        except OSError:
            pass

    db.delete(analysis)
    db.commit()
    return None
227
+
228
+
229
+ # =============================================================================
230
+ # USAGE TRACKING ENDPOINTS
231
+ # =============================================================================
232
+
233
@router.get("/usage")
def get_usage_stats(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Get upload usage statistics for all users.
    Shows uploads used, limit, and percentage for admin dashboard.
    """
    from app.services.feature_service import get_effective_upload_limit

    stats = []
    for u in db.query(User).all():
        # Admin accounts report under the synthetic "Admin" plan.
        plan = "Admin" if u.is_admin else (u.plan or "Individual")

        cap = get_effective_upload_limit(db, plan)
        used = u.monthly_upload_count or 0
        pct = round(used / cap * 100, 1) if cap > 0 else 0

        stats.append({
            "id": u.id,
            "email": u.email,
            "full_name": u.full_name,
            "visique_id": u.visique_id,
            "plan": plan,
            "uploads_used": used,
            "uploads_limit": cap,
            "usage_percentage": pct,
            "reset_date": u.upload_reset_date.isoformat() if u.upload_reset_date else None,
        })

    # Heaviest users first.
    stats.sort(key=lambda row: row["usage_percentage"], reverse=True)
    return stats
271
+
272
+
273
+ # =============================================================================
274
+ # FEATURE FLAG ENDPOINTS
275
+ # =============================================================================
276
+
277
@router.get("/features")
def get_feature_matrix(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Get the full feature matrix for admin console.
    Shows all features grouped by category with per-plan toggles.
    """
    # Imported lazily and renamed so the service helper doesn't shadow
    # this route function's own name.
    from app.services.feature_service import get_feature_matrix as get_matrix
    return get_matrix(db)
288
+
289
+
290
@router.get("/features/registry")
def get_feature_registry(
    current_user: User = Depends(get_current_admin)
):
    """
    Get the feature registry - all available features.
    Useful for understanding what features can be controlled.
    """
    from app.core.feature_registry import get_features_by_category, get_all_feature_ids

    # Flatten each category's feature objects into plain dicts for JSON.
    categories = {
        cat_name: [
            {
                "id": feat.id,
                "name": feat.name,
                "description": feat.description,
                "default_enabled": feat.default_enabled,
            }
            for feat in feats
        ]
        for cat_name, feats in get_features_by_category().items()
    }

    return {
        "total_features": len(get_all_feature_ids()),
        "categories": categories,
    }
318
+
319
+
320
@router.get("/features/{plan_name}")
def get_plan_features(
    plan_name: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Get enabled features for a specific plan.
    """
    from app.services.feature_service import get_effective_features, get_effective_upload_limit
    from app.core.plan_config import get_all_plans, get_all_engines

    # Plan names and engine names share this endpoint.
    is_known = plan_name in get_all_plans() or plan_name in get_all_engines()
    if not is_known:
        raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")

    return {
        "plan": plan_name,
        "upload_limit": get_effective_upload_limit(db, plan_name),
        "enabled_features": get_effective_features(db, plan_name),
    }
340
+
341
+
342
@router.put("/features/{plan_name}")
def update_plan_features(
    plan_name: str,
    request: FeatureToggleRequest,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Bulk update features for a plan.
    """
    from app.services.feature_service import bulk_set_features
    from app.core.plan_config import get_all_plans, get_all_engines

    if plan_name not in get_all_plans() and plan_name not in get_all_engines():
        raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")

    updated = bulk_set_features(db, plan_name, request.feature_states, current_user.id)

    return {
        "message": f"Updated {updated} features for {plan_name}",
        "plan": plan_name,
        "updated_count": updated,
    }
365
+
366
+
367
@router.post("/features/{plan_name}/reset")
def reset_plan_features(
    plan_name: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Reset a plan's features to defaults (removes all overrides).
    """
    from app.services.feature_service import reset_plan_to_defaults
    from app.core.plan_config import get_all_plans, get_all_engines

    if plan_name not in get_all_plans() and plan_name not in get_all_engines():
        raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")

    removed = reset_plan_to_defaults(db, plan_name)

    return {
        "message": f"Reset {plan_name} to defaults, removed {removed} overrides",
        "plan": plan_name,
        "removed_overrides": removed,
    }
389
+
390
+
391
class UploadLimitRequest(BaseModel):
    upload_limit: int


@router.put("/features/{plan_name}/limit")
def update_plan_upload_limit(
    plan_name: str,
    request: UploadLimitRequest,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Update upload limit for a plan.
    """
    from app.models.feature_flags import PlanUploadLimit
    from app.core.plan_config import get_all_plans

    if plan_name not in get_all_plans():
        raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")

    # Upsert the per-plan override row, recording who changed it.
    override = (
        db.query(PlanUploadLimit)
        .filter(PlanUploadLimit.plan_name == plan_name)
        .first()
    )
    if override is None:
        override = PlanUploadLimit(
            plan_name=plan_name,
            upload_limit=request.upload_limit,
            updated_by_id=current_user.id,
        )
        db.add(override)
    else:
        override.upload_limit = request.upload_limit
        override.updated_by_id = current_user.id

    db.commit()

    return {
        "message": f"Updated upload limit for {plan_name}",
        "plan": plan_name,
        "new_limit": request.upload_limit,
    }
434
+
435
+
436
+ # =============================================================================
437
+ # CLASSIFIER TRAINING ENDPOINTS
438
+ # =============================================================================
439
+
440
from fastapi import UploadFile, File, Form

@router.get("/classifier/doc-types")
def get_classifier_doc_types(
    current_user: User = Depends(get_current_admin)
):
    """
    Get all 53 document types for the training UI dropdown.
    Returns summary list with id, key, display_name, category, keyword_count.
    """
    # Lazy import keeps the (large) keyword registry off the module import path.
    from app.services.ingestion.doc_keywords import get_all_doc_types_summary
    return get_all_doc_types_summary()
452
+
453
+
454
@router.post("/classifier/train")
async def train_classifier(
    doc_type: str = Form(...),
    files: List[UploadFile] = File(...),
    current_user: User = Depends(get_current_admin)
):
    """
    Upload up to 5 reference PDFs for a specific doc type.
    Extracts candidate keywords that admins can review and approve.
    Uses thread pool for PDF processing to prevent blocking.
    """
    from app.services.ingestion.keyword_learner import (
        extract_candidate_keywords, MAX_TRAINING_FILES
    )
    from app.services.ingestion.doc_keywords import DOC_TYPE_REGISTRY
    # FIX: removed unused `from pypdf import PdfReader` import — extraction
    # goes through HybridPDFParser below, PdfReader was never referenced.
    import tempfile
    import shutil
    import time
    from fastapi.concurrency import run_in_threadpool

    request_start = time.time()
    print(f"[Train] Started for doc_type={doc_type}, files={len(files)}")

    # Validate doc type
    if doc_type not in DOC_TYPE_REGISTRY:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown doc type: {doc_type}. Use GET /admin/classifier/doc-types for valid types."
        )

    # Validate file count
    if len(files) > MAX_TRAINING_FILES:
        raise HTTPException(
            status_code=400,
            detail=f"Maximum {MAX_TRAINING_FILES} files allowed per training batch."
        )

    # Helper function to run in threadpool
    def extract_pdf_text(path: str) -> str:
        """Extract combined markdown text from one PDF (blocking; threadpool)."""
        from app.services.ingestion.hybrid_parser import HybridPDFParser
        try:
            parser = HybridPDFParser()
            result = parser.parse(path)
            # The hybrid parser returns a combined markdown string
            return result.full_markdown if result and result.full_markdown else ""
        except Exception as e:
            print(f"Hybrid PDF Extraction Error: {e}")
            return ""

    # Process files
    texts = []
    errors = []

    for f in files:
        # FIX: UploadFile.filename may be None; guard before calling .lower()
        # so a missing filename is reported instead of raising AttributeError.
        if not f.filename or not f.filename.lower().endswith(".pdf"):
            errors.append(f"Skipped non-PDF: {f.filename}")
            continue

        tmp_path = None
        try:
            # Stream to temp file (low memory usage)
            file_start = time.time()
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                shutil.copyfileobj(f.file, tmp)
                tmp_path = tmp.name

            file_size = os.path.getsize(tmp_path) / 1024
            print(f"[Train] File '{f.filename}' saved ({file_size:.0f}KB) in {time.time() - file_start:.2f}s")

            # Run extraction in thread pool (non-blocking)
            extract_start = time.time()
            text = await run_in_threadpool(extract_pdf_text, tmp_path)
            print(f"[Train] Extraction completed in {time.time() - extract_start:.2f}s, {len(text)} chars")

            if text.strip():
                texts.append(text)
            else:
                errors.append(f"No text extracted from: {f.filename}")

        except Exception as e:
            print(f"[Train] ERROR processing {f.filename}: {e}")
            errors.append(f"Failed to process {f.filename}: {str(e)}")
        finally:
            # Clean up temp file
            if tmp_path and os.path.exists(tmp_path):
                try:
                    os.unlink(tmp_path)
                # FIX: was a bare `except:` — only filesystem errors are
                # expected here; anything else should propagate.
                except OSError:
                    pass

    if not texts:
        raise HTTPException(
            status_code=400,
            detail=f"No valid text extracted from any files. Errors: {errors}"
        )

    # Extract candidate keywords (CPU intensive, also run in threadpool)
    kw_start = time.time()
    candidates = await run_in_threadpool(extract_candidate_keywords, texts, doc_type)
    print(f"[Train] Keyword extraction completed in {time.time() - kw_start:.2f}s, {len(candidates)} candidates")
    print(f"[Train] TOTAL request time: {time.time() - request_start:.2f}s")

    return {
        "doc_type": doc_type,
        "files_processed": len(texts),
        "candidates": candidates,
        "errors": errors if errors else None,
    }
563
+
564
+
565
class KeywordApprovalRequest(BaseModel):
    doc_type: str
    keywords: List[str]


@router.post("/classifier/approve")
def approve_classifier_keywords(
    request: KeywordApprovalRequest,
    current_user: User = Depends(get_current_admin)
):
    """
    Approve candidate keywords and persist them to the learned registry.
    """
    from app.services.ingestion.keyword_learner import approve_keywords
    from app.services.ingestion.doc_keywords import DOC_TYPE_REGISTRY

    if request.doc_type not in DOC_TYPE_REGISTRY:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown doc type: {request.doc_type}"
        )
    if not request.keywords:
        raise HTTPException(
            status_code=400,
            detail="No keywords provided."
        )

    return approve_keywords(request.doc_type, request.keywords)
595
+
596
+
597
@router.get("/classifier/stats")
def get_classifier_stats(
    current_user: User = Depends(get_current_admin)
):
    """
    Get classifier training statistics for the admin dashboard.
    """
    # Thin delegate: all aggregation lives in the keyword_learner service.
    from app.services.ingestion.keyword_learner import get_training_stats
    return get_training_stats()
606
+
app/api/auth.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime, timedelta
2
+ from typing import Optional
3
+ from fastapi import APIRouter, Depends, HTTPException, status
4
+ from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
5
+ from jose import JWTError, jwt
6
+ from passlib.context import CryptContext
7
+ from sqlalchemy.orm import Session
8
+ from app.core.database import get_db
9
+ from app.models.user import User
10
+ from app.schemas.user import UserCreate, UserResponse, Token, UpgradeRequest
11
+ from app.core.security import SECRET_KEY, ALGORITHM, ACCESS_TOKEN_EXPIRE_MINUTES
12
+
13
+ from app.core.security import verify_password, get_password_hash, create_access_token, ALGORITHM, SECRET_KEY, ACCESS_TOKEN_EXPIRE_MINUTES
14
+
15
+ router = APIRouter(prefix="/auth", tags=["auth"])
16
+
@router.get("/probe")
def probe():
    """Liveness probe confirming the auth router is mounted."""
    response = {"status": "auth_router_working"}
    return response
+
21
+ from fastapi import Query
22
+
23
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/login", auto_error=False)
24
+
@router.post("/register", response_model=UserResponse)
def register(user: UserCreate, db: Session = Depends(get_db)):
    """Create a new user account.

    Grants admin / super-admin flags when a valid admin key is supplied and
    assigns a generated Visique ID: ``VISI-######`` for admins,
    ``VSQ-XXXXXXXX`` for regular users.

    Raises:
        HTTPException 400: the email is already registered.
        HTTPException 500: any other failure during account creation.
    """
    try:
        # Reject duplicate registrations by email.
        db_user = db.query(User).filter(User.email == user.email).first()
        if db_user:
            raise HTTPException(status_code=400, detail="Email already registered")
        hashed_password = get_password_hash(user.password)

        # Valid Admin Keys
        # NOTE(review): admin keys are hard-coded in source — consider moving
        # them to configuration/secrets.
        VALID_ADMIN_KEYS = [
            "VSQADM001", "VSQADM002", "VSQADM003",
            "VSQADM004", "VSQADM005", "VSQADM006"
        ]

        # Check Admin Key
        is_admin = False
        is_super_admin = False
        SUPER_ADMIN_KEYS = ["VSQADM003", "VSQADM006"]

        if user.admin_key and user.admin_key in VALID_ADMIN_KEYS:
            is_admin = True
            if user.admin_key in SUPER_ADMIN_KEYS:
                is_super_admin = True

        # Generate Visique ID
        import uuid
        import random
        if is_admin:
            # VISI-###### (6 digits)
            digits = ''.join([str(random.randint(0, 9)) for _ in range(6)])
            visique_id = f"VISI-{digits}"
        else:
            visique_id = f"VSQ-{str(uuid.uuid4())[:8].upper()}"

        new_user = User(
            email=user.email,
            hashed_password=hashed_password,
            full_name=user.full_name,
            company_name=user.company_name,
            is_admin=is_admin,
            is_super_admin=is_super_admin,
            visique_id=visique_id
        )
        db.add(new_user)
        db.commit()
        db.refresh(new_user)
        return new_user
    except HTTPException as he:
        # Re-raise intentional HTTP errors (e.g. duplicate email) untouched.
        raise he
    except Exception as e:
        print(f"Registration Error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Registration failed: {str(e)}")
+
@router.post("/login", response_model=Token)
def login(form_data: OAuth2PasswordRequestForm = Depends(), db: Session = Depends(get_db)):
    """Authenticate by email/password and issue a bearer JWT."""
    user = db.query(User).filter(User.email == form_data.username).first()
    authenticated = user is not None and verify_password(form_data.password, user.hashed_password)
    if not authenticated:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Incorrect username or password",
            headers={"WWW-Authenticate": "Bearer"},
        )
    token = create_access_token(
        data={"sub": user.email},
        expires_delta=timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES),
    )
    return {"access_token": token, "token_type": "bearer"}
+
async def get_current_user(
    token: Optional[str] = Depends(oauth2_scheme),
    db: Session = Depends(get_db),
    # Frontend passes ?token=... for downloads
    query_token: Optional[str] = Query(None, alias="token")
):
    """FastAPI dependency resolving the authenticated user from a JWT.

    Accepts the token from the Authorization header (oauth2_scheme has
    auto_error disabled) or, as a fallback, from a ``?token=`` query
    parameter used by the frontend for direct download links.

    Raises:
        HTTPException 401: missing token, invalid/expired JWT, or unknown user.
    """
    # Header token wins; the query-string token is the fallback.
    actual_token = token or query_token

    credentials_exception = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Could not validate credentials",
        headers={"WWW-Authenticate": "Bearer"},
    )
    if not actual_token:
        raise credentials_exception

    try:
        payload = jwt.decode(actual_token, SECRET_KEY, algorithms=[ALGORITHM])
        email: str = payload.get("sub")
        if email is None:
            # HTTPException is not a JWTError, so this propagates out of the try.
            raise credentials_exception
    except JWTError:
        raise credentials_exception
    user = db.query(User).filter(User.email == email).first()
    if user is None:
        raise credentials_exception
    return user
+
@router.get("/me", response_model=UserResponse)
async def read_users_me(current_user: User = Depends(get_current_user)):
    """Return the authenticated caller's own profile."""
    return current_user
+
125
+ from pydantic import BaseModel
126
+
class ProfileUpdate(BaseModel):
    """Partial-update payload for PATCH /auth/me; omitted fields are untouched."""
    full_name: Optional[str] = None
    company_name: Optional[str] = None
    address: Optional[str] = None
+
@router.patch("/me", response_model=UserResponse)
async def update_profile(
    updates: ProfileUpdate,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Partially update the caller's profile.

    A company-name change is propagated to all of the user's past analyses,
    including the cached ``result_json`` payloads, so historical reports
    show the new name.
    """
    if updates.full_name is not None:
        current_user.full_name = updates.full_name
    if updates.company_name is not None and updates.company_name != current_user.company_name:
        current_user.company_name = updates.company_name
        # Propagate to all past analyses
        from app.models.user import Analysis
        import json

        analyses = db.query(Analysis).filter(Analysis.user_id == current_user.id).all()
        for analysis in analyses:
            analysis.company_name = updates.company_name
            if analysis.result_json:
                try:
                    data = json.loads(analysis.result_json)
                    if "raw_data" in data and "company_name" in data["raw_data"]:
                        data["raw_data"]["company_name"] = updates.company_name
                    analysis.result_json = json.dumps(data)
                except Exception as e:
                    # Best-effort: a malformed cached report must not block the profile update.
                    print(f"Error updating result_json for analysis {analysis.id}: {e}")

    if updates.address is not None:
        current_user.address = updates.address
    db.commit()
    db.refresh(current_user)
    return current_user
+
164
+ from app.core.config import settings
165
+ from app.core.stripe_config import create_checkout_session
166
+ import stripe
167
+ from fastapi import Request
168
+
@router.post("/create-checkout-session")
def create_payment(
    plan_id: str,  # Pass the Stripe Price ID
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Create a Stripe Checkout session for the given price id and return its URL."""
    checkout = create_checkout_session(current_user, plan_id)
    if not checkout:
        raise HTTPException(status_code=400, detail="Error creating payment session")
    return {"url": checkout.url}
+
@router.post("/webhook")
async def stripe_webhook(request: Request, db: Session = Depends(get_db)):
    """Stripe webhook receiver.

    Verifies the event signature, then on ``checkout.session.completed``
    upgrades the referenced user's plan for 30 days and records the payment.

    Raises:
        HTTPException 400: payload or signature verification failed.
    """
    payload = await request.body()
    sig_header = request.headers.get("stripe-signature")

    try:
        event = stripe.Webhook.construct_event(
            payload, sig_header, settings.STRIPE_WEBHOOK_SECRET
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail="Invalid payload")
    except stripe.error.SignatureVerificationError as e:
        raise HTTPException(status_code=400, detail="Invalid signature")

    if event["type"] == "checkout.session.completed":
        session = event["data"]["object"]

        # Retrieve user and update plan
        # Note: metadata values are strings
        user_id = session.get("client_reference_id")
        if user_id:
            user = db.query(User).filter(User.id == int(user_id)).first()
            if user:
                # NOTE(review): plan is hard-coded to "Business" regardless of
                # which price was purchased — confirm this is intentional.
                user.plan = "Business"  # Or derive from session
                user.plan_expires_at = datetime.utcnow() + timedelta(days=30)

                # Record Payment
                from app.models.user import Payment
                new_payment = Payment(
                    user_id=user.id,
                    # Stripe amounts are in the smallest currency unit (cents).
                    amount=session.get("amount_total", 0) / 100.0,
                    status="paid",
                    plan_name="Business",
                    date=datetime.utcnow()
                )
                db.add(new_payment)
                db.commit()

    return {"status": "success"}
+
220
+ from typing import List
221
+ from app.schemas.user import PaymentResponse
222
+ from app.models.user import Payment
223
+
@router.get("/payments/me", response_model=List[PaymentResponse])
def read_my_payments(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """List every payment record belonging to the authenticated user."""
    payments = db.query(Payment).filter(Payment.user_id == current_user.id)
    return payments.all()
+
228
+ from fastapi import UploadFile, File
229
+ from fastapi.responses import RedirectResponse
230
+ from app.services.storage import StorageService
231
+ import uuid
232
+
@router.post("/me/avatar")
async def upload_avatar(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload a profile picture to R2 and store a proxy URL on the user.

    Raises:
        HTTPException 400: extension is not one of jpg/jpeg/png/webp.
        HTTPException 500: the storage upload failed.
    """
    # Determine file extension
    ext = file.filename.split(".")[-1]
    if ext.lower() not in ["jpg", "jpeg", "png", "webp"]:
        raise HTTPException(status_code=400, detail="Invalid image format. Use JPG, PNG, or WebP.")

    # Save file to R2
    # Prefix with the user id + a UUID so uploads can never collide.
    safe_filename = f"{current_user.id}_{uuid.uuid4()}.{ext}"
    object_key = f"avatars/{safe_filename}"

    try:
        await StorageService.upload_file(file, object_key)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Avatar upload failed: {str(e)}")

    # Update User Profile
    # Point to the proxy endpoint
    current_user.profile_picture_url = f"/api/v1/auth/avatars/{safe_filename}"

    db.commit()
    db.refresh(current_user)

    return {"message": "Avatar updated", "url": current_user.profile_picture_url}
+
@router.get("/avatars/{filename}")
async def get_avatar(filename: str):
    """
    Proxy endpoint for avatars.
    Redirects to a short-lived presigned URL on R2.

    Raises:
        HTTPException 404: no presigned URL could be produced for the key.
    """
    # Fix: the route path and object key contained a literal "(unknown)"
    # placeholder, so the `filename` parameter was never used and every
    # request resolved to the same nonexistent key. Use the real path
    # parameter, matching the URL written by upload_avatar.
    object_key = f"avatars/{filename}"
    try:
        url = StorageService.get_presigned_url(object_key, expiration=3600)  # 1 hour cache
        return RedirectResponse(url=url)
    except Exception:
        raise HTTPException(status_code=404, detail="Avatar not found")
app/api/endpoints.py ADDED
@@ -0,0 +1,830 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
2
+ from fastapi.concurrency import run_in_threadpool
3
+ from fastapi.responses import RedirectResponse
4
+ from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
5
+ from app.schemas.financial import StandardizedDataPackage
6
+ from app.services.storage import StorageService
7
+ from app.core.security import create_access_token
8
+ from typing import Annotated
9
+ from pydantic import BaseModel
10
+ from datetime import date
11
+ import os
12
+ from app.services.ingestion.parser_csv import CSVParser
13
+ from app.services.ingestion.parser_pdf import PDFParser
14
+ from app.services.analysis.kpi import KPIAnalyzer
15
+ from app.services.analysis.risk import RiskAnalyzer
16
+ from app.services.analysis.health_score import HealthScoreAnalyzer
17
+ from app.services.analysis.fundamental import FundamentalAnalyzer
18
+ from app.services.analysis.factory import AnalysisFactory
19
+ from app.services.analysis.growth import GrowthAnalyzer
20
+ from app.services.analysis.simulation import SimulationService
21
+ from app.services.reporting.pdf_report import PDFReporter
22
+ from app.services.reporting.pptx_report import PPTXReporter
23
+ from app.schemas.financial import StandardizedDataPackage, FinancialReport, IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics, RiskAnalysis, HealthScoreBreakdown
24
+ from financial_model.models import VisiVeritasReport
25
+ from app.schemas.chat import ChatRequest, ChatResponse
26
+ from app.api.auth import get_current_user
27
+ from app.models.user import User, Analysis
28
+ from app.core.database import get_db
29
+ from sqlalchemy.orm import Session
30
+ import json
31
+ from fastapi.responses import FileResponse
32
+ from app.services.feature_service import get_effective_features
33
+
34
+ router = APIRouter(prefix="/analysis", tags=["analysis"])
35
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")
36
+
@router.post("/token")
async def login(form_data: Annotated[OAuth2PasswordRequestForm, Depends()]):
    """Legacy demo token endpoint with a single hard-coded credential pair."""
    # Mock User DB (kept for legacy demo, but real auth is at /auth/login)
    valid = form_data.username == "analyst" and form_data.password == "visique"
    if valid:
        token = create_access_token(data={"sub": form_data.username})
        return {"access_token": token, "token_type": "bearer"}
    raise HTTPException(status_code=400, detail="Incorrect username or password")
+
@router.get("/doc-types")
def get_doc_types(current_user: User = Depends(get_current_user)):
    """Get all supported document types for the upload dropdown."""
    from app.services.ingestion.doc_keywords import get_all_doc_types_summary
    summary = get_all_doc_types_summary()
    return summary
+
# Admin Dependency
def get_current_admin(current_user: User = Depends(get_current_user)):
    """FastAPI dependency: require the authenticated user to be an admin.

    Raises:
        HTTPException 403: the user is not flagged ``is_admin``.
    """
    # Fix: removed a duplicated, unreachable copy of this check that
    # followed the return statement in the original.
    if not current_user.is_admin:
        raise HTTPException(status_code=403, detail="Admin privileges required")
    return current_user
+
@router.get("/admin/users")
def get_all_users(
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Admin: list every user with basic account stats."""
    def _serialize(u):
        # One row of the admin user table.
        return {
            "id": u.id,
            "email": u.email,
            "full_name": u.full_name,
            "company_name": u.company_name,
            "is_admin": u.is_admin,
            "created_at": u.created_at,
            "analysis_count": len(u.analyses),
            "preferred_engine": getattr(u, "preferred_engine", "v1"),
        }

    return [_serialize(u) for u in db.query(User).all()]
+
@router.get("/admin/analyses")
def get_all_analyses(
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Admin: list every analysis across all users, newest first."""
    records = db.query(Analysis).order_by(Analysis.timestamp.desc()).all()
    payload = []
    for a in records:
        payload.append({
            "id": a.id,
            "user_email": a.owner.email,
            "user_company": a.owner.company_name,
            "company_name": a.company_name,
            "filename": a.input_filename,
            "timestamp": a.timestamp,
        })
    return payload
+
@router.get("/admin/analyses/{analysis_id}/download")
def admin_download_file(
    analysis_id: int,
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Admin-only download of the original uploaded file for any analysis.

    Raises:
        HTTPException 404: record missing, no stored filename, or file absent.
    """
    analysis = db.query(Analysis).filter(Analysis.id == analysis_id).first()
    if not analysis or not analysis.stored_filename:
        raise HTTPException(status_code=404, detail="File not found")

    # NOTE(review): this serves from the local filesystem, but the upload
    # endpoints store files in R2 under keys like "uploads/..." — for those
    # records os.path.exists() is False and this returns 404. Confirm this
    # path is only meant for legacy locally-stored files.
    if not os.path.exists(analysis.stored_filename):
        raise HTTPException(status_code=404, detail="File missing from server storage")

    return FileResponse(
        path=analysis.stored_filename,
        filename=f"ADMIN_EXPORT_{analysis.input_filename}",
        media_type='application/octet-stream'
    )
+
116
+
import json  # NOTE(review): duplicate of the module-level `import json` above; harmless but removable.

# Admin Dependency
# NOTE(review): re-defines get_current_admin declared earlier in this module
# (the later definition wins at import time). Consider keeping only one copy.
def get_current_admin(current_user: User = Depends(get_current_user)):
    """FastAPI dependency: allow only admin users, else respond 403."""
    if not current_user.is_admin:
        raise HTTPException(status_code=403, detail="Admin privileges required")
    return current_user
+
@router.post("/upload/csv", response_model=StandardizedDataPackage)
async def analyze_csv(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload a CSV, persist it to R2, run the analysis pipeline, and save the result.

    Raises:
        HTTPException 403: monthly upload limit reached.
        HTTPException 400: wrong file type, or Visi-Veritas confidence < 30.
        HTTPException 500: any other parsing/analysis failure.
    """
    # Check upload limit
    from app.services.feature_service import check_upload_limit, increment_upload_count
    limit_check = check_upload_limit(db, current_user)
    if not limit_check["can_upload"]:
        raise HTTPException(
            status_code=403,
            detail=f"Monthly upload limit reached ({limit_check['uploads_limit']} uploads). Upgrade your plan for more uploads. Resets on {limit_check['reset_date'][:10]}."
        )

    if not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a .csv file.")

    # Secure filename and path
    import uuid
    safe_filename = f"{uuid.uuid4()}_{file.filename}"
    object_key = f"uploads/{safe_filename}"
    temp_path = f"/tmp/{safe_filename}"

    try:
        # Upload to R2 (Service enforces size limit)
        await StorageService.upload_file(file, object_key)

        # Parsers expect a file path, so mirror the upload into /tmp:
        # the upload consumed the stream, hence the seek(0) before copying.
        with open(temp_path, "wb") as f:
            file.file.seek(0)
            import shutil
            shutil.copyfileobj(file.file, f)

        report = await run_in_threadpool(CSVParser.parse, temp_path)

        # Clean up temp file
        if os.path.exists(temp_path):
            os.remove(temp_path)

        # Run Unified Analysis (includes Phase 2 & 3 extensions)
        # Select Engine based on User Preference
        analyzer = AnalysisFactory.get_analyzer(current_user)
        # Fetch enabled features for user's plan
        enabled_features = get_effective_features(db, current_user.plan or "Free")
        analysis_result = await run_in_threadpool(analyzer.analyze, report, user_address=current_user.address, enabled_features=enabled_features)

        # Combine text insights
        # Include risk_factors (which contain "Pain Point:" entries) in the insights array
        risk_factors = analysis_result["risk_analysis"].risk_factors if analysis_result.get("risk_analysis") else []
        all_insights = analysis_result["insights"] + analysis_result["recommendations"] + risk_factors

        # ---- Visi-Veritas Hard-Halt Check ----
        veritas_data = analysis_result.get("visi_veritas", {})
        confidence = veritas_data.get("confidence_score", 100)
        if confidence < 30:
            raise HTTPException(
                status_code=400,
                detail={
                    "error": "Visi-Veritas validation failed: The extracted financial data contains critical inaccuracies.",
                    "confidence_score": confidence,
                    "failed_rules": veritas_data.get("failed_rules", []),
                    "warnings": veritas_data.get("warnings", []),
                    "debug_context": veritas_data.get("debug_context", {}),
                }
            )

        result_package = StandardizedDataPackage(
            raw_data=report,
            kpis=analysis_result["kpis"],
            risk_analysis=analysis_result["risk_analysis"],
            health_score=analysis_result["health_score"],
            insights=all_insights,
            runway_forecast=analysis_result["runway_forecast"],
            optimization_insights=analysis_result["optimization_insights"],
            geo_analysis=analysis_result.get("geo_analysis"),
            visi_veritas=VisiVeritasReport(**veritas_data) if veritas_data else None,
        )

        # Save to DB
        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=report.company_name,
            input_filename=file.filename,
            stored_filename=object_key,
            result_json=result_package.json()
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        result_package.analysis_id = db_analysis.id
        result_package.timestamp = db_analysis.timestamp.isoformat()
        result_package.runner_name = current_user.full_name or current_user.email

        # Increment upload count AFTER successful save
        increment_upload_count(db, current_user)

        return result_package

    except HTTPException:
        # Fix: HTTPException subclasses Exception, so the generic handler
        # below used to re-wrap the Visi-Veritas 400 halt as an opaque 500.
        # Propagate intentional HTTP errors untouched.
        raise
    except Exception as e:
        # Cleanup if analysis fails: remove the temp copy; the R2 object is
        # left as an orphan (no DB record references it).
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+
252
+ from fastapi import Body
253
+
@router.post("/save")
async def save_analysis_result(
    payload: dict = Body(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """
    Receives pre-computed analysis results from Vercel serverless functions
    and persists them to the database. This endpoint does NOT run analysis -
    it only handles authentication and database storage.
    """
    from app.services.feature_service import increment_upload_count
    try:
        raw_data = payload.get("raw_data", {})
        if isinstance(raw_data, dict):
            company_name = raw_data.get("company_name", "Unknown")
        else:
            company_name = "Unknown"

        # Pop so the filename is not duplicated inside the stored JSON.
        original_filename = payload.pop("original_filename", "uploaded_file")

        record = Analysis(
            user_id=current_user.id,
            company_name=company_name,
            input_filename=original_filename,
            stored_filename="vercel_processed",
            result_json=json.dumps(payload)
        )
        db.add(record)
        db.commit()
        db.refresh(record)

        # Increment upload count
        increment_upload_count(db, current_user)

        return {"status": "saved", "analysis_id": record.id}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to save analysis: {str(e)}")
+
292
+ @router.get("/history")
@router.get("/history")
def get_history(
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """List the current user's analyses, newest first."""
    query = (
        db.query(Analysis)
        .filter(Analysis.user_id == current_user.id)
        .order_by(Analysis.timestamp.desc())
    )
    history = []
    for a in query.all():
        history.append({
            "id": a.id,
            "company_name": a.company_name,
            "filename": a.input_filename,
            "timestamp": a.timestamp,
            "runner_name": a.owner.full_name or a.owner.email,
        })
    return history
+
@router.get("/history/{analysis_id}", response_model=StandardizedDataPackage)
def get_analysis_detail(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Return one stored analysis (owned by the caller) as a full package."""
    analysis = (
        db.query(Analysis)
        .filter(Analysis.id == analysis_id, Analysis.user_id == current_user.id)
        .first()
    )
    if analysis is None:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # Rehydrate the cached JSON, then overlay record-level metadata.
    pkg = StandardizedDataPackage.parse_raw(analysis.result_json)
    pkg.analysis_id = analysis.id
    pkg.timestamp = analysis.timestamp.isoformat()
    pkg.runner_name = analysis.owner.full_name or analysis.owner.email
    return pkg
+
@router.get("/history/{analysis_id}/download")
def download_original_file(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Redirect the caller to a presigned R2 URL for their originally uploaded file."""
    analysis = (
        db.query(Analysis)
        .filter(Analysis.id == analysis_id, Analysis.user_id == current_user.id)
        .first()
    )
    if analysis is None or not analysis.stored_filename:
        raise HTTPException(status_code=404, detail="File not found")

    # Generate Presigned URL for Redirect
    try:
        url = StorageService.get_presigned_url(analysis.stored_filename)
        return RedirectResponse(url=url)
    except Exception as e:
        raise HTTPException(status_code=404, detail="File missing from storage")
+
342
+
@router.delete("/history/{analysis_id}")
def delete_analysis(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete one of the caller's analyses along with its stored source file."""
    analysis = (
        db.query(Analysis)
        .filter(Analysis.id == analysis_id, Analysis.user_id == current_user.id)
        .first()
    )
    if analysis is None:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # Delete file from R2
    if analysis.stored_filename:
        StorageService.delete_file(analysis.stored_filename)

    db.delete(analysis)
    db.commit()
    return {"status": "success", "message": "Analysis deleted"}
+
class UpdateAnalysisRequest(BaseModel):
    """Payload for renaming the company on a stored analysis."""
    company_name: str
+
@router.patch("/history/{analysis_id}")
def update_analysis(
    analysis_id: int,
    request: UpdateAnalysisRequest,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Rename the company on one of the caller's analyses.

    Updates both the DB column and the cached result JSON so the stored
    report stays consistent with the new name.

    Raises:
        HTTPException 404: analysis missing or owned by another user.
    """
    analysis = db.query(Analysis).filter(Analysis.id == analysis_id, Analysis.user_id == current_user.id).first()
    if not analysis:
        raise HTTPException(status_code=404, detail="Analysis not found")

    analysis.company_name = request.company_name

    # Update the stored JSON to reflect new name (consistency)
    # Fix: the original used a bare `except:`, which also swallows
    # SystemExit/KeyboardInterrupt; catch only what JSON access can raise.
    try:
        data = json.loads(analysis.result_json)
        data['raw_data']['company_name'] = request.company_name
        analysis.result_json = json.dumps(data)
    except (ValueError, KeyError, TypeError):
        pass  # If JSON parsing fails, just update DB record

    db.commit()
    return {"status": "success", "message": "Analysis updated", "company_name": analysis.company_name}
+
@router.post("/upload/pdf", response_model=StandardizedDataPackage)
async def analyze_pdf(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload a PDF, persist it to R2, run the analysis pipeline, and save the result.

    Raises:
        HTTPException 403: monthly upload limit reached.
        HTTPException 400: wrong file type, or Visi-Veritas confidence < 30.
        HTTPException 500: any other parsing/analysis failure.
    """
    # Check upload limit
    from app.services.feature_service import check_upload_limit, increment_upload_count
    limit_check = check_upload_limit(db, current_user)
    if not limit_check["can_upload"]:
        raise HTTPException(
            status_code=403,
            detail=f"Monthly upload limit reached ({limit_check['uploads_limit']} uploads). Upgrade your plan for more uploads. Resets on {limit_check['reset_date'][:10]}."
        )

    if not file.filename.endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a .pdf file.")

    import uuid
    safe_filename = f"{uuid.uuid4()}_{file.filename}"
    object_key = f"uploads/{safe_filename}"
    temp_path = f"/tmp/{safe_filename}"

    try:
        # Upload to R2 (Limit Enforced)
        await StorageService.upload_file(file, object_key)

        # Parse logic: the upload consumed the stream, so rewind before
        # mirroring the file into /tmp for the path-based parser.
        file.file.seek(0)
        with open(temp_path, "wb") as f:
            import shutil
            shutil.copyfileobj(file.file, f)

        report = await run_in_threadpool(PDFParser.parse, temp_path)

        # Cleanup temp
        if os.path.exists(temp_path):
            os.remove(temp_path)

        # Run Unified Analysis
        # Select Engine based on User Preference
        analyzer = AnalysisFactory.get_analyzer(current_user)

        # Resolve all feature flags (Plan + Custom + Engine limits)
        from app.services.feature_service import resolve_user_features
        enabled_features = resolve_user_features(db, current_user)

        analysis_result = await run_in_threadpool(analyzer.analyze, report, user_address=current_user.address, enabled_features=enabled_features)

        # Include risk_factors (which contain "Pain Point:" entries) in the insights array
        risk_factors = analysis_result["risk_analysis"].risk_factors if analysis_result.get("risk_analysis") else []
        all_insights = analysis_result["insights"] + analysis_result["recommendations"] + risk_factors

        # ---- Visi-Veritas Hard-Halt Check ----
        veritas_data = analysis_result.get("visi_veritas", {})
        confidence = veritas_data.get("confidence_score", 100)
        if confidence < 30:
            raise HTTPException(
                status_code=400,
                detail={
                    "error": "Visi-Veritas validation failed: The extracted financial data contains critical inaccuracies.",
                    "confidence_score": confidence,
                    "failed_rules": veritas_data.get("failed_rules", []),
                    "warnings": veritas_data.get("warnings", []),
                    "debug_context": veritas_data.get("debug_context", {}),
                }
            )

        result_package = StandardizedDataPackage(
            raw_data=report,
            kpis=analysis_result["kpis"],
            risk_analysis=analysis_result["risk_analysis"],
            health_score=analysis_result["health_score"],
            insights=all_insights,
            runway_forecast=analysis_result["runway_forecast"],
            optimization_insights=analysis_result["optimization_insights"],
            geo_analysis=analysis_result.get("geo_analysis"),
            visi_veritas=VisiVeritasReport(**veritas_data) if veritas_data else None,
        )

        # Save to DB
        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=report.company_name,
            input_filename=file.filename,
            stored_filename=object_key,
            result_json=result_package.json()
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        result_package.analysis_id = db_analysis.id
        result_package.timestamp = db_analysis.timestamp.isoformat()
        result_package.runner_name = current_user.full_name or current_user.email

        # Increment upload count AFTER successful save
        increment_upload_count(db, current_user)

        return result_package

    except HTTPException:
        # Fix: HTTPException subclasses Exception, so the generic handler
        # below used to re-wrap the Visi-Veritas 400 halt as an opaque 500.
        # Propagate intentional HTTP errors untouched.
        raise
    except Exception as e:
        # Remove the temp copy on failure; the R2 object is left orphaned.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+
492
+
493
+ # =============================================================================
494
+ # XLSX UPLOAD ENDPOINT
495
+ # =============================================================================
496
+
497
@router.post("/upload/xlsx", response_model=StandardizedDataPackage)
async def analyze_xlsx(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload and analyze an Excel (.xlsx, .xls) file.

    Flow: quota check -> upload to R2 -> spool to /tmp -> parse ->
    analysis engine -> Visi-Veritas hard-halt check -> persist -> return.

    Raises:
        HTTPException 403: monthly upload quota exhausted.
        HTTPException 400: wrong file type or Visi-Veritas validation failure.
        HTTPException 500: any unexpected failure during processing.
    """
    # Check upload limit
    from app.services.feature_service import check_upload_limit, increment_upload_count
    limit_check = check_upload_limit(db, current_user)
    if not limit_check["can_upload"]:
        raise HTTPException(
            status_code=403,
            detail=f"Monthly upload limit reached ({limit_check['uploads_limit']} uploads). Upgrade your plan for more uploads. Resets on {limit_check['reset_date'][:10]}."
        )

    # BUGFIX: guard against a missing filename before calling .endswith()
    if not file.filename or not (file.filename.endswith('.xlsx') or file.filename.endswith('.xls')):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload an .xlsx or .xls file.")

    import uuid
    safe_filename = f"{uuid.uuid4()}_{file.filename}"
    object_key = f"uploads/{safe_filename}"
    temp_path = f"/tmp/{safe_filename}"

    try:
        # Upload original to R2 object storage (retained for re-download)
        await StorageService.upload_file(file, object_key)

        # Spool to a local temp file for the parser
        file.file.seek(0)
        with open(temp_path, "wb") as f:
            import shutil
            shutil.copyfileobj(file.file, f)

        # Use XLSX Parser; BUGFIX: clean the temp copy up even when parsing
        # raises (previously the file leaked on failure).
        from app.services.ingestion.parser_xlsx import XLSXParser
        try:
            report = await run_in_threadpool(XLSXParser.parse, temp_path)
        finally:
            if os.path.exists(temp_path):
                os.remove(temp_path)

        # Run Unified Analysis — engine selected by user preference
        analyzer = AnalysisFactory.get_analyzer(current_user)

        # Resolve all feature flags (Plan + Custom + Engine limits)
        from app.services.feature_service import resolve_user_features
        enabled_features = resolve_user_features(db, current_user)

        analysis_result = await run_in_threadpool(analyzer.analyze, report, user_address=current_user.address, enabled_features=enabled_features)

        # Risk factors (which contain "Pain Point:" entries) are folded into insights
        risk_factors = analysis_result["risk_analysis"].risk_factors if analysis_result.get("risk_analysis") else []
        all_insights = analysis_result["insights"] + analysis_result["recommendations"] + risk_factors

        # ---- Visi-Veritas Hard-Halt Check ----
        veritas_data = analysis_result.get("visi_veritas", {})
        confidence = veritas_data.get("confidence_score", 100)
        if confidence < 30:
            raise HTTPException(
                status_code=400,
                detail={
                    "error": "Visi-Veritas validation failed: The extracted financial data contains critical inaccuracies.",
                    "confidence_score": confidence,
                    "failed_rules": veritas_data.get("failed_rules", []),
                    "warnings": veritas_data.get("warnings", []),
                    "debug_context": veritas_data.get("debug_context", {}),
                }
            )

        result_package = StandardizedDataPackage(
            raw_data=report,
            kpis=analysis_result["kpis"],
            risk_analysis=analysis_result["risk_analysis"],
            health_score=analysis_result["health_score"],
            insights=all_insights,
            runway_forecast=analysis_result["runway_forecast"],
            optimization_insights=analysis_result["optimization_insights"],
            geo_analysis=analysis_result.get("geo_analysis"),
            visi_veritas=VisiVeritasReport(**veritas_data) if veritas_data else None,
        )

        # Save to DB
        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=report.company_name,
            input_filename=file.filename,
            stored_filename=object_key,
            result_json=result_package.json()
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        result_package.analysis_id = db_analysis.id
        result_package.timestamp = db_analysis.timestamp.isoformat()
        result_package.runner_name = current_user.full_name or current_user.email

        # Increment upload count only AFTER a fully successful save
        increment_upload_count(db, current_user)

        return result_package

    except HTTPException:
        # BUGFIX: previously `except Exception` swallowed the deliberate 400
        # Visi-Veritas halt above and re-wrapped it as a generic 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"XLSX Analysis failed: {str(e)}")
601
+
602
+
603
+ # =============================================================================
604
+ # BULK DELETE ENDPOINTS
605
+ # =============================================================================
606
+
607
class BulkDeleteRequest(BaseModel):
    """Request body for bulk deletion of analyses owned by the caller."""
    ids: list[int]  # Analysis primary keys; unknown/foreign IDs are reported, not deleted
609
+
610
@router.delete("/history/bulk-delete")
def bulk_delete_analyses(
    request: BulkDeleteRequest,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete multiple analyses at once.

    Each ID is checked against the current user's ownership; missing or
    foreign rows are collected as errors instead of aborting the batch.
    """
    errors = []
    deleted_count = 0

    for target_id in request.ids:
        record = (
            db.query(Analysis)
            .filter(Analysis.id == target_id, Analysis.user_id == current_user.id)
            .first()
        )

        if record is None:
            errors.append(f"Analysis {target_id} not found")
            continue

        # Remove the stored source file from object storage first
        if record.stored_filename:
            StorageService.delete_file(record.stored_filename)

        db.delete(record)
        deleted_count += 1

    db.commit()

    return {
        "status": "success",
        "deleted_count": deleted_count,
        "errors": errors if errors else None,
    }
644
+
645
+
646
class DateRangeDeleteRequest(BaseModel):
    """Request body for deleting analyses inside an inclusive date window."""
    start_date: str  # YYYY-MM-DD (inclusive)
    end_date: str  # YYYY-MM-DD (inclusive)
649
+
650
@router.delete("/history/delete-range")
def delete_analyses_in_range(
    request: DateRangeDeleteRequest,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete all of the current user's analyses within an inclusive date range.

    Raises:
        HTTPException 400: malformed dates or start_date after end_date.
    """
    from datetime import datetime

    try:
        start = datetime.strptime(request.start_date, "%Y-%m-%d")
        # BUGFIX: include microseconds so rows timestamped inside the final
        # second of the end day (e.g. 23:59:59.500) are not silently skipped.
        end = datetime.strptime(request.end_date, "%Y-%m-%d").replace(
            hour=23, minute=59, second=59, microsecond=999999
        )
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid date format. Use YYYY-MM-DD.")

    if start > end:
        raise HTTPException(status_code=400, detail="start_date must not be after end_date.")

    # Find analyses in range (scoped to the current user)
    analyses = db.query(Analysis).filter(
        Analysis.user_id == current_user.id,
        Analysis.timestamp >= start,
        Analysis.timestamp <= end
    ).all()

    deleted_count = 0
    for analysis in analyses:
        # Remove the stored source file before deleting the DB row
        if analysis.stored_filename:
            StorageService.delete_file(analysis.stored_filename)
        db.delete(analysis)
        deleted_count += 1

    db.commit()

    return {
        "status": "success",
        "deleted_count": deleted_count,
        "date_range": f"{request.start_date} to {request.end_date}"
    }
686
+
687
class SimulationRequest(BaseModel):
    """What-if simulation input: a prior analysis plus percentage deltas."""
    data: StandardizedDataPackage  # previously computed package to re-simulate
    delta_revenue: float = 0.0  # percent adjustments, 0.0 = unchanged
    delta_cogs: float = 0.0
    delta_payroll: float = 0.0
    delta_marketing: float = 0.0
    delta_fixed_costs: float = 0.0
694
+
695
@router.post("/simulate", response_model=StandardizedDataPackage)
async def run_simulation(request: SimulationRequest, user: User = Depends(get_current_user)):
    """Re-run the what-if simulation over a previously analyzed package.

    The auth dependency is required for access control only; the user
    object itself is not read here.
    """
    return SimulationService.run_simulation(
        original_data=request.data.raw_data,
        delta_revenue_percent=request.delta_revenue,
        delta_cogs_percent=request.delta_cogs,
        delta_payroll_percent=request.delta_payroll,
        delta_marketing_percent=request.delta_marketing,
        delta_fixed_costs_percent=request.delta_fixed_costs
    )
705
+
706
@router.get("/history/{analysis_id}/export/pdf")
def export_analysis_pdf(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Render a stored analysis as a downloadable PDF report."""
    from fastapi.responses import FileResponse

    analysis = db.query(Analysis).filter(Analysis.id == analysis_id, Analysis.user_id == current_user.id).first()
    if analysis is None:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # Rehydrate the stored JSON payload into the typed package
    try:
        data = StandardizedDataPackage.parse_raw(analysis.result_json)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Data corruption: {str(e)}")

    # Build the PDF under /tmp using a filesystem-safe company name
    company = data.raw_data.company_name
    safe_name = "".join(ch for ch in company if ch.isalnum() or ch in " _-")
    filename = f"/tmp/{safe_name}_{analysis.id}_report.pdf"
    PDFReporter.generate(data, filename)

    from datetime import datetime
    date_str = datetime.now().strftime("%Y-%m-%d")
    download_name = f"Visi-Insight Report - {company} - {date_str}.pdf"
    return FileResponse(filename, media_type='application/pdf', filename=download_name)
734
+
735
@router.post("/ai-cfo", response_model=str)
async def get_ai_summary(data: StandardizedDataPackage, user: User = Depends(get_current_user)):
    """Generate a narrative executive summary for an analyzed data package."""
    from app.services.intelligence.ai_cfo import AICFOService
    return AICFOService.generate_executive_summary(data)
739
+
740
@router.post("/chat", response_model=ChatResponse)
async def chat_with_data(request: ChatRequest, user: User = Depends(get_current_user)):
    """Answer a chat question against a (currently mocked) financial context.

    NOTE(review): the data context is a hard-coded dummy package, not the
    caller's real data — this endpoint is scaffold-only until a session or
    vector store is wired in.
    """
    # Note: In a real app, 'data_context' would be retrieved from a session or vector DB
    # For this stateless scaffold, we assume we want to query a mock global context or previously uploaded file.
    # To keep it simple for the frontend demo, we will accept the data in the request or just mock the context access
    # since we don't have a persistent session store implemented yet.

    # Check if a file was recently uploaded (using a global for demo simplicity, or pass mock)
    # Ideally, we'd pass the DataPackage in the request, but it's too big.
    # We will instantiate a dummy context if none exists, or rely on client sending relevant context.

    # PROPER IMPLEMENTATION:
    # 1. User uploads file -> Backend stores Vector Index ID in User Session.
    # 2. /chat -> retrieves Index ID -> Queries Vector DB.

    # MOCK IMPLEMENTATION:
    from app.schemas.financial import StandardizedDataPackage, FinancialReport, IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics, RiskAnalysis, HealthScoreBreakdown
    from datetime import date

    # Create a dummy context for the scaffold to prove the endpoint works
    # In production, this would be retrieved from session or vector DB
    dummy_data = StandardizedDataPackage(
        raw_data=FinancialReport(
            company_name="Demo Corp",
            period_end=date.today(),
            income_statement=IncomeStatementStandard(revenue=1200000, net_income=240000, cogs=600000),
            balance_sheet=BalanceSheetStandard(),
            cash_flow=CashFlowStandard()
        ),
        kpis=KPIMetrics(net_margin=20.0),
        risk_analysis=RiskAnalysis(risk_score=85, risk_factors=[], liquidity_risk="Low", solvency_risk="Low"),
        health_score=HealthScoreBreakdown(stability=20, profitability=20, growth=20, efficiency=20, total_score=80),
        insights=["Automated Report Generation Successful"],
        optimization_insights=None # Should be populated normally
    )

    from app.services.intelligence.gemini_service import GeminiService
    return GeminiService.query(request, dummy_data)
778
+
779
@router.get("/export/pptx/{company_name}")
async def export_pptx(company_name: str):
    """Generate a demo PowerPoint deck for the given company name.

    NOTE(review): unlike the sibling endpoints, this route has no auth
    dependency — confirm whether it is intentionally public.
    """
    from fastapi.responses import FileResponse

    dummy_data = StandardizedDataPackage(
        raw_data=FinancialReport(
            company_name=company_name,
            period_end=date.today(),
            income_statement=IncomeStatementStandard(revenue=1000000, net_income=200000, cogs=500000),
            balance_sheet=BalanceSheetStandard(),
            cash_flow=CashFlowStandard()
        ),
        kpis=KPIMetrics(net_margin=20.0),
        risk_analysis=RiskAnalysis(risk_score=85, risk_factors=[], liquidity_risk="Low", solvency_risk="Low"),
        health_score=HealthScoreBreakdown(stability=20, profitability=20, growth=20, efficiency=20, total_score=80),
        insights=["Automated Report Generation Successful"]
    )

    # BUGFIX: sanitize the path component so a crafted company_name
    # (e.g. "../../etc/cron.d/x") cannot escape /tmp. Mirrors the
    # sanitization used by the PDF export endpoint.
    safe_name = "".join(x for x in company_name if x.isalnum() or x in " _-") or "report"
    filename = f"/tmp/{safe_name}_presentation.pptx"
    PPTXReporter.generate(dummy_data, filename)

    return FileResponse(filename, media_type='application/vnd.openxmlformats-officedocument.presentationml.presentation', filename=f"{company_name}_presentation.pptx")
801
+
802
class EngineUpdate(BaseModel):
    """Admin request body for switching a user's analysis engine."""
    engine: str  # must be "v1" or "v2" (validated in the endpoint)
804
+
805
@router.put("/admin/users/{user_id}/engine")
def update_user_engine(
    user_id: int,
    update: EngineUpdate,
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Admin-only: switch which analysis engine a user's uploads run on."""
    target = db.query(User).filter(User.id == user_id).first()
    if target is None:
        raise HTTPException(status_code=404, detail="User not found")

    if update.engine not in ("v1", "v2"):
        raise HTTPException(status_code=400, detail="Invalid engine. Use 'v1' or 'v2'.")

    target.preferred_engine = update.engine
    db.commit()
    return {"status": "success", "engine": target.preferred_engine}
822
+
823
@router.get("/public-config")
def get_public_config(db: Session = Depends(get_db)):
    """Get configuration for Guest/Public users (no auth required)."""
    from app.services.feature_service import get_effective_features
    return {
        "guest_features": get_effective_features(db, "Guest"),
        # NOTE(review): hard-coded guest upload limit — presumably this should
        # come from plan_config; confirm before relying on it elsewhere.
        "upload_limit": 2
    }
app/api/visilok.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends
2
+ from app.api.auth import get_current_user
3
+ from app.models.user import User
4
+ import os
5
+
6
+ router = APIRouter(prefix="/visilok", tags=["security"])
7
+
8
def get_current_admin(current_user: User = Depends(get_current_user)):
    """FastAPI dependency: pass the user through only if they are an admin."""
    if current_user.is_admin:
        return current_user
    from fastapi import HTTPException
    raise HTTPException(status_code=403, detail="Admin privileges required for Visi-Lok monitor.")
13
+
14
@router.get("/status")
def get_security_status(admin: User = Depends(get_current_admin)):
    """
    Visi-Lok: Security & Auth Engine Monitor
    Reports on the current encryption and security thresholds of the system.
    """
    # 1. Database encryption — managed PostgreSQL on Render encrypts at rest;
    #    a local dev database does not. Detect cloud vs local via env vars.
    is_production = os.getenv("RENDER") == "true" or os.getenv("NODE_ENV") == "production"
    db_status = {
        "encrypted_at_rest": is_production,
        "algorithm": "AES-256" if is_production else "None (Local)",
        "provider": "Render/AWS KMS" if is_production else "Local Disk",
    }

    # 2. Object storage — Cloudflare R2 applies AES-256 at rest by default.
    r2_configured = bool(os.getenv("R2_ACCOUNT_ID"))
    storage_status = {
        "encrypted_at_rest": r2_configured,
        "algorithm": "AES-256" if r2_configured else "None",
        "provider": "Cloudflare R2" if r2_configured else "Local Storage",
    }

    # 3. Transport layer — HTTPS termination is handled by the hosting
    #    load balancers (Render/Vercel), so it is reported as enforced.
    tls_status = {
        "enforced": True,  # Usually handled by Render/Vercel load balancers
        "protocol": "TLS 1.3/1.2",
    }

    # 4. Auth & row-level security — enforced via Depends(get_current_user)
    #    plus per-user filtering at the query level.
    auth_status = {
        "row_level_security": "Active",
        "jwt_encryption": True,
        "password_hashing": "Argon2",
        "mfa_enabled": False,  # Future roadmap
    }

    overall_secure = db_status["encrypted_at_rest"] and storage_status["encrypted_at_rest"]
    verdict = "SECURE" if overall_secure else "WARNING: UNENCRYPTED COMPONENTS DETECTED"

    return {
        "system": "Visi-Lok Security Monitor",
        "status": verdict,
        "checks": {
            "database": db_status,
            "object_storage": storage_status,
            "transport": tls_status,
            "authentication": auth_status,
        },
    }
app/core/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Core Configuration Package
3
+
4
+ This package contains application-wide configuration and utilities.
5
+
6
+ ## Modules
7
+
8
+ - `config.py` - Environment variables and settings
9
+ - `database.py` - SQLAlchemy engine and session
10
+ - `security.py` - JWT token creation/validation
11
+ - `feature_registry.py` - Centralized feature definitions (auto-discoverable)
12
+ - `plan_config.py` - Plan limits and default feature sets
13
+
14
+ ## Feature System Architecture
15
+
16
+ The feature system uses a layered approach:
17
+
18
+ 1. **Feature Registry** (`feature_registry.py`)
19
+ - Defines ALL controllable features
20
+ - Features auto-appear in admin console
21
+ - Organized by category for easy navigation
22
+
23
+ 2. **Plan Config** (`plan_config.py`)
24
+ - Default features per plan tier
25
+ - Upload limits per plan
26
+ - Wildcard "*" for unlimited access
27
+
28
+ 3. **Admin Overrides** (via `models/feature_flags.py`)
29
+ - Stored in database
30
+ - Takes precedence over defaults
31
+ - Managed via admin API
32
+
33
+ ## Adding New Features
34
+
35
+ ```python
36
+ # In feature_registry.py, add to FEATURE_REGISTRY:
37
+ Feature(
38
+ id="new_feature_id",
39
+ name="New Feature Name",
40
+ description="What this feature does",
41
+ category=FeatureCategory.CORE_METRICS # Pick appropriate category
42
+ )
43
+ ```
44
+
45
+ The feature will automatically:
46
+ - Appear in admin console UI
47
+ - Be toggleable per plan
48
+ - Respect plan defaults until overridden
49
+ """
50
+
51
+ from app.core.feature_registry import FEATURE_REGISTRY, Feature, FeatureCategory
52
+ from app.core.plan_config import PLAN_DEFAULTS, get_plan_config, get_default_features
app/core/config.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings, SettingsConfigDict
2
+ from pydantic import field_validator
3
+ from typing import List, Union, Optional
4
+
5
class Settings(BaseSettings):
    """Application settings loaded from the environment (and .env).

    SECRET_KEY and DATABASE_URL are required; everything else has a
    default or is optional.
    """

    # Application Config
    PROJECT_NAME: str = "Visique API"
    VERSION: str = "0.1.0"
    API_V1_STR: str = "/api/v1"

    # Security
    SECRET_KEY: str  # Required in production
    ALGORITHM: str = "HS256"
    ACCESS_TOKEN_EXPIRE_MINUTES: int = 1440  # 24 hours for better UX

    # Database
    DATABASE_URL: str  # PostgreSQL URL

    # CORS
    ALLOWED_ORIGINS: Union[List[str], str] = [
        "http://localhost:3000",
        "http://127.0.0.1:3000",
        "https://visique-testing.vercel.app",
        "https://visique-frontend.vercel.app"
    ]

    @field_validator("ALLOWED_ORIGINS", mode="before")
    @classmethod
    def assemble_cors_origins(cls, v: Union[str, List[str]]) -> Union[List[str], str]:
        """Accept a comma-separated string, a JSON list literal, or a list."""
        if isinstance(v, str) and not v.startswith("["):
            return [i.strip() for i in v.split(",")]
        elif isinstance(v, str) and v.startswith("["):
            import json
            return json.loads(v)
        elif isinstance(v, list):
            return v
        raise ValueError(v)

    # Stripe
    STRIPE_SECRET_KEY: Optional[str] = None
    STRIPE_PUBLISHABLE_KEY: Optional[str] = None
    STRIPE_WEBHOOK_SECRET: Optional[str] = None

    # Deployment
    ENVIRONMENT: str = "development"

    # Cloudflare R2 Storage
    # BUGFIX: these four fields were declared twice in the class (a second,
    # identical block followed the Dolphin settings); the duplicate is removed.
    R2_ACCOUNT_ID: Optional[str] = None
    R2_ACCESS_KEY_ID: Optional[str] = None
    R2_SECRET_ACCESS_KEY: Optional[str] = None
    R2_BUCKET_NAME: Optional[str] = None

    # Dolphin PDF Extraction
    DOLPHIN_MODEL_PATH: Optional[str] = None  # Auto-downloads if None
    DOLPHIN_DEVICE: str = "auto"  # "auto" (CUDA > MPS > CPU) | "cuda" | "mps" | "cpu"
    DOLPHIN_MAX_BATCH_SIZE: int = 4
    DOLPHIN_AUTO_DOWNLOAD: bool = True

    # Dolphin Remote Service (Optional - for distributed setup)
    DOLPHIN_API_URL: Optional[str] = None
    DOLPHIN_API_KEY: Optional[str] = None

    model_config = SettingsConfigDict(env_file=".env", case_sensitive=True, extra="ignore")

settings = Settings()
app/core/database.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import create_engine
2
+ from sqlalchemy.ext.declarative import declarative_base
3
+ from sqlalchemy.orm import sessionmaker
4
+
5
+ from app.core.config import settings
6
+
7
SQLALCHEMY_DATABASE_URL = settings.DATABASE_URL

# Fix for Render/SQLAlchemy postgres:// scheme
# (modern SQLAlchemy rejects the legacy "postgres://" prefix some hosts emit)
if SQLALCHEMY_DATABASE_URL.startswith("postgres://"):
    SQLALCHEMY_DATABASE_URL = SQLALCHEMY_DATABASE_URL.replace("postgres://", "postgresql://", 1)

# PostgreSQL-specific connect_args (keepalives break SQLite)
_is_sqlite = SQLALCHEMY_DATABASE_URL.startswith("sqlite")

# Engine options shared by both backends.
_engine_kwargs: dict = dict(
    pool_pre_ping=True,
)

if not _is_sqlite:
    # Pooling + TCP keepalive tuning for a hosted PostgreSQL instance.
    _engine_kwargs.update(
        pool_recycle=280,
        pool_size=5,
        max_overflow=10,
        connect_args={
            "keepalives": 1,
            "keepalives_idle": 30,
            "keepalives_interval": 10,
            "keepalives_count": 5,
            "connect_timeout": 10,
        },
    )
else:
    # SQLite objects default to single-thread use; server handlers may
    # touch the connection from worker threads.
    _engine_kwargs["connect_args"] = {"check_same_thread": False}

engine = create_engine(SQLALCHEMY_DATABASE_URL, **_engine_kwargs)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

Base = declarative_base()
40
+
41
def get_db():
    """Dependency generator: yield a DB session, always closing it after use."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
app/core/feature_registry.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feature Registry - Auto-Discoverable Feature System
3
+
4
+ Add new features here and they will automatically appear in the admin console.
5
+ Each feature belongs to a category and can be toggled per plan.
6
+ """
7
+
8
+ from enum import Enum
9
+ from dataclasses import dataclass, field
10
+ from typing import List, Dict, Optional
11
+
12
+
13
class FeatureCategory(Enum):
    """Categories for organizing features in admin console.

    The enum *value* is the display label used for grouping in the UI.
    """
    CORE_METRICS = "Core Metrics"
    RISK_ANALYSIS = "Risk Analysis"
    FORECASTING = "Forecasting"
    AI_INTELLIGENCE = "AI Intelligence"
    INTERACTIVE = "Interactive Tools"
    EXPORTS = "Exports & Reports"
+
22
+
23
@dataclass
class Feature:
    """
    Represents a controllable feature in the system.

    Attributes:
        id: Unique identifier used in code and API
        name: Human-readable name for admin console
        category: Grouping category
        description: Brief description of the feature
        default_enabled: Whether enabled by default for new plans
        memory_cost_mb: Estimated RAM usage in MB when the feature runs
    """
    id: str
    name: str
    category: FeatureCategory
    description: str
    default_enabled: bool = True
    memory_cost_mb: int = 5  # Estimated RAM usage in MB
+
42
+
43
+
44
+ # =============================================================================
45
+ # FEATURE REGISTRY - ADD NEW FEATURES HERE
46
+ # =============================================================================
47
+ # When adding new financial model outputs, add a Feature entry below.
48
+ # It will automatically appear in the admin console under the correct category.
49
+ # =============================================================================
50
+
51
FEATURE_REGISTRY: List[Feature] = [
    # -------------------------------------------------------------------------
    # Core Metrics
    # -------------------------------------------------------------------------
    # Entries omitting memory_cost_mb use the Feature default (5 MB).
    Feature(
        id="kpi_margins",
        name="Profit Margins (Gross/Operating/Net)",
        category=FeatureCategory.CORE_METRICS,
        description="Core margin KPIs from income statement",
        memory_cost_mb=2
    ),
    Feature(
        id="kpi_ratios",
        name="Financial Ratios",
        category=FeatureCategory.CORE_METRICS,
        description="Current ratio, debt-to-equity, quick ratio",
        memory_cost_mb=2
    ),
    Feature(
        id="health_score",
        name="Health Score Dashboard",
        category=FeatureCategory.CORE_METRICS,
        description="Overall financial health scoring (stability, profitability, growth, efficiency)"
    ),

    # -------------------------------------------------------------------------
    # Risk Analysis
    # -------------------------------------------------------------------------
    Feature(
        id="risk_score",
        name="Risk Score",
        category=FeatureCategory.RISK_ANALYSIS,
        description="Aggregate risk scoring (0-100)",
        memory_cost_mb=5
    ),
    Feature(
        id="risk_factors",
        name="Risk Factor Breakdown",
        category=FeatureCategory.RISK_ANALYSIS,
        description="Detailed list of identified risk factors"
    ),
    Feature(
        id="liquidity_risk",
        name="Liquidity Risk",
        category=FeatureCategory.RISK_ANALYSIS,
        description="Cash flow and working capital risk assessment"
    ),
    Feature(
        id="solvency_risk",
        name="Solvency Risk",
        category=FeatureCategory.RISK_ANALYSIS,
        description="Long-term debt sustainability analysis"
    ),

    # -------------------------------------------------------------------------
    # Forecasting
    # -------------------------------------------------------------------------
    Feature(
        id="runway_forecast",
        name="Cash Runway Forecast",
        category=FeatureCategory.FORECASTING,
        description="30/60/90 day cash projections"
    ),
    Feature(
        id="burn_rate",
        name="Burn Rate Analysis",
        category=FeatureCategory.FORECASTING,
        description="Monthly cash burn rate calculation"
    ),
    Feature(
        id="optimization_insights",
        name="Optimization Insights",
        category=FeatureCategory.FORECASTING,
        description="Dead zones, peak premiums, cost optimization"
    ),
    Feature(
        id="budget_variance",
        name="Budget Variance Analysis",
        category=FeatureCategory.FORECASTING,
        description="Target vs actual comparison"
    ),

    # -------------------------------------------------------------------------
    # AI Intelligence — the heaviest features by estimated memory cost
    # -------------------------------------------------------------------------
    Feature(
        id="ai_cfo",
        name="AI CFO Chat",
        category=FeatureCategory.AI_INTELLIGENCE,
        description="Conversational AI financial advisor",
        memory_cost_mb=80
    ),
    Feature(
        id="ai_summary",
        name="AI Executive Summary",
        category=FeatureCategory.AI_INTELLIGENCE,
        description="Auto-generated narrative insights",
        memory_cost_mb=60
    ),
    Feature(
        id="geo_insights",
        name="Geo-Strategic Insights",
        category=FeatureCategory.AI_INTELLIGENCE,
        description="Location-based market analysis",
        memory_cost_mb=150
    ),
    Feature(
        id="intelligence_card",
        name="Strategic Intelligence Card",
        category=FeatureCategory.AI_INTELLIGENCE,
        description="AI-powered strategic recommendations",
        memory_cost_mb=50
    ),

    # -------------------------------------------------------------------------
    # Interactive Tools
    # -------------------------------------------------------------------------
    Feature(
        id="what_if_slider",
        name="What-If Simulator",
        category=FeatureCategory.INTERACTIVE,
        description="Revenue/cost scenario modeling with sliders"
    ),
    Feature(
        id="interactive_charts",
        name="Interactive Charts",
        category=FeatureCategory.INTERACTIVE,
        description="Zoomable, hoverable data visualizations"
    ),
    Feature(
        id="trend_comparison",
        name="Trend Comparison",
        category=FeatureCategory.INTERACTIVE,
        description="Period-over-period analysis"
    ),

    # -------------------------------------------------------------------------
    # Exports & Reports
    # -------------------------------------------------------------------------
    Feature(
        id="pdf_export",
        name="PDF Report Export",
        category=FeatureCategory.EXPORTS,
        description="Downloadable PDF financial report"
    ),
    Feature(
        id="pptx_export",
        name="PowerPoint Export",
        category=FeatureCategory.EXPORTS,
        description="Presentation-ready slides"
    ),
    Feature(
        id="csv_export",
        name="Data Export (CSV)",
        category=FeatureCategory.EXPORTS,
        description="Raw data download for further analysis"
    ),

    # -------------------------------------------------------------------------
    # Data Integrity (grouped under Core Metrics in the admin console)
    # -------------------------------------------------------------------------
    Feature(
        id="visi_veritas",
        name="Visi-Veritas Audit",
        category=FeatureCategory.CORE_METRICS,
        description="32-rule financial data validation engine with confidence scoring",
        memory_cost_mb=2
    ),
]
220
+
221
+
222
+ # =============================================================================
223
+ # HELPER FUNCTIONS
224
+ # =============================================================================
225
+
226
def get_all_features() -> List[Feature]:
    """Returns all registered features (the live registry list, not a copy)."""
    return FEATURE_REGISTRY
229
+
230
+
231
def get_feature_by_id(feature_id: str) -> Optional[Feature]:
    """Look up a registered feature; None when the ID is unknown."""
    return next((f for f in FEATURE_REGISTRY if f.id == feature_id), None)
237
+
238
+
239
def get_all_feature_ids() -> List[str]:
    """Returns list of all feature IDs, in registry order."""
    return [feature.id for feature in FEATURE_REGISTRY]
242
+
243
+
244
def get_features_by_category() -> Dict[str, List[Feature]]:
    """Group features by category display name; empty categories are omitted."""
    grouped: Dict[str, List[Feature]] = {}
    for category in FeatureCategory:
        members = [f for f in FEATURE_REGISTRY if f.category is category]
        if members:
            grouped[category.value] = members
    return grouped
252
+
253
+
254
def get_default_enabled_features() -> List[str]:
    """IDs of every feature whose default_enabled flag is set."""
    return [feature.id for feature in FEATURE_REGISTRY if feature.default_enabled]
257
+
258
+
259
def validate_feature_ids(feature_ids: List[str]) -> List[str]:
    """
    Validates a list of feature IDs against the registry.
    Returns the unknown IDs (empty list when everything is valid).
    """
    known = set(get_all_feature_ids())
    return [fid for fid in feature_ids if fid not in known]
app/core/migrations.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Automatic Schema Migration Utility

This module runs at startup to ensure database columns match the SQLAlchemy models.
It adds any missing columns automatically, preventing 'UndefinedColumn' errors
in production when new fields are added to models.

Limitations: columns are only ever ADDED — renames, drops, and type changes
still require a real migration tool.
"""

from sqlalchemy import inspect, text
from sqlalchemy.engine import Engine
import logging

# Module-level logger; configured by the application, not here.
logger = logging.getLogger(__name__)
14
+
15
+
16
def get_model_columns(model_class):
    """Extract column definitions from a SQLAlchemy model class.

    Walks the class attributes and collects anything exposing the
    ``.property.columns`` chain of an instrumented column attribute,
    returning a mapping of column name -> Column object.

    Fix: removed the unused local ``from sqlalchemy import Column`` import.
    """
    columns = {}
    for attr_name in dir(model_class):
        # Default of None guards against descriptors that raise on access.
        attr = getattr(model_class, attr_name, None)
        if hasattr(attr, 'property') and hasattr(attr.property, 'columns'):
            col = attr.property.columns[0]
            columns[col.name] = col
    return columns
26
+
27
+
28
def get_db_columns(engine: Engine, table_name: str):
    """Get existing column names from the database table.

    Returns an empty set when the table cannot be inspected (e.g. it does
    not exist yet) — callers treat an empty set as "table missing" and let
    ``create_all`` build it. The broad except is deliberate best-effort.
    """
    inspector = inspect(engine)
    try:
        return {col['name'] for col in inspector.get_columns(table_name)}
    except Exception:
        return set()
35
+
36
+
37
def get_column_type_sql(column):
    """Map a SQLAlchemy column type onto a SQL type string (PostgreSQL flavour)."""
    from sqlalchemy import Boolean, Integer, String, DateTime, Text, Float, JSON

    # Length-bounded string columns keep their declared size.
    declared_length = getattr(column.type, 'length', None)
    if declared_length:
        return f"VARCHAR({declared_length})"

    sql_by_type = {
        Boolean: "BOOLEAN",
        Integer: "INTEGER",
        String: "VARCHAR(255)",
        DateTime: "TIMESTAMP",
        Text: "TEXT",
        Float: "FLOAT",
        JSON: "JSONB",  # PostgreSQL JSON type
    }
    # Exact type match, as before; unknown types fall back to TEXT.
    return sql_by_type.get(type(column.type), "TEXT")
58
+
59
+
60
def get_default_sql(column):
    """Build the SQL DEFAULT clause for a column, or '' when none applies.

    Handles scalar Python defaults (bool/int/float/str/dict). Callable
    defaults (e.g. ``datetime.utcnow``) cannot be expressed as a static
    SQL DEFAULT, so they yield '' just as before — they keep applying at
    the ORM level only.
    """
    if column.default is None:
        return ""
    default_val = column.default.arg
    if callable(default_val):
        # Explicit skip (previously fell through all isinstance checks).
        return ""
    # bool must be tested before int: isinstance(True, int) is also True.
    if isinstance(default_val, bool):
        return "DEFAULT TRUE" if default_val else "DEFAULT FALSE"
    if isinstance(default_val, (int, float)):
        return f"DEFAULT {default_val}"
    if isinstance(default_val, str):
        # Fix: escape embedded single quotes so the literal stays valid SQL.
        escaped = default_val.replace("'", "''")
        return f"DEFAULT '{escaped}'"
    if isinstance(default_val, dict):
        return "DEFAULT '{}'"
    return ""
73
+
74
+
75
def run_migrations(engine: Engine):
    """
    Check all models and add any missing columns to the database.
    This runs at application startup.

    For each registered model: diff the model's declared columns against
    the live table, then issue ``ALTER TABLE ... ADD COLUMN`` for anything
    missing. Identifiers are interpolated into the SQL string directly;
    they come from our own model definitions, never from user input.
    """
    # Imported here (not at module top) to avoid circular imports at startup.
    from app.models.user import User, Analysis, Payment
    from app.models.feature_flags import PlanFeatureOverride, PlanUploadLimit

    models = [User, Analysis, Payment, PlanFeatureOverride, PlanUploadLimit]

    for model in models:
        table_name = model.__tablename__
        model_cols = get_model_columns(model)
        db_cols = get_db_columns(engine, table_name)

        if not db_cols:
            # Table doesn't exist yet, let create_all handle it
            logger.info(f"Table '{table_name}' not found, will be created by create_all()")
            continue

        missing_cols = set(model_cols.keys()) - db_cols

        for col_name in missing_cols:
            col = model_cols[col_name]
            col_type = get_column_type_sql(col)
            default_clause = get_default_sql(col)

            sql = f'ALTER TABLE {table_name} ADD COLUMN {col_name} {col_type} {default_clause}'

            try:
                # Fresh connection per ALTER so one failure cannot poison the rest.
                with engine.connect() as conn:
                    conn.execute(text(sql))
                    conn.commit()
                logger.info(f"✓ Added column '{col_name}' to table '{table_name}'")
            except Exception as e:
                # Column might already exist or other issue
                logger.warning(f"Could not add column '{col_name}' to '{table_name}': {e}")
app/core/plan_config.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Plan Configuration - Default settings for each subscription plan.
3
+
4
+ This module defines upload limits and default feature access per plan.
5
+ Admins can override these defaults via the admin console.
6
+ """
7
+
8
+ from typing import Dict, List, Any
9
+ from .feature_registry import get_all_feature_ids
10
+
11
+
12
+ # =============================================================================
13
+ # PLAN CONFIGURATION
14
+ # =============================================================================
15
+ # Each plan has:
16
+ # - upload_limit: Monthly upload cap
17
+ # - is_session: True for guest/anonymous (session-based tracking)
18
+ # - features: List of enabled feature IDs, or ["*"] for all features
19
+ # =============================================================================
20
+
21
# NOTE(review): models.user.User.plan defaults to "Free", which is NOT a key
# here — such users resolve to the "Individual" fallback in get_plan_config().
# Confirm that is intended.
PLAN_DEFAULTS: Dict[str, Dict[str, Any]] = {
    # Guest users on /try page (session-based, no account)
    "Guest": {
        "upload_limit": 2,
        "is_session": True,
        "features": [
            "kpi_margins",
            "health_score",
            "risk_score",
            "pdf_export"
        ]
    },

    # Free trial - full Small Business experience for 1 month
    "Free Trial": {
        "upload_limit": 15,
        "is_session": False,
        "features": [
            "kpi_margins",
            "kpi_ratios",
            "health_score",
            "risk_score",
            "risk_factors",
            "runway_forecast",
            "burn_rate",
            "interactive_charts",
            "pdf_export"
        ]
    },

    # Individual plan - $9/month
    "Individual": {
        "upload_limit": 5,
        "is_session": False,
        "features": [
            "kpi_margins",
            "kpi_ratios",
            "health_score",
            "risk_score",
            "risk_factors",
            "pdf_export"
        ]
    },

    # Organization plan - $49/month
    "Organization": {
        "upload_limit": 10,
        "is_session": False,
        "features": [
            "kpi_margins",
            "kpi_ratios",
            "health_score",
            "risk_score",
            "risk_factors",
            "liquidity_risk",
            "runway_forecast",
            "ai_summary",
            "interactive_charts",
            "pdf_export"
        ]
    },

    # Small Business plan - $99/month
    "Small Business": {
        "upload_limit": 15,
        "is_session": False,
        "features": ["*"]  # All features
    },

    # Mid Business plan - $249/month
    "Mid Business": {
        "upload_limit": 25,
        "is_session": False,
        "features": ["*"]  # All features
    },

    # Large Business / Enterprise - $499+/month
    "Large Business": {
        "upload_limit": 50,
        "is_session": False,
        "features": ["*"]  # All features
    },

    # Admin users - unlimited access
    "Admin": {
        "upload_limit": 999999,
        "is_session": False,
        "features": ["*"]
    },

    # Engine Configs (Treated as Plans for feature flags, not billable tiers)
    "_ENGINE_v1": {
        "upload_limit": 0,
        "is_session": False,
        "features": ["*"]
    },
    "_ENGINE_v2": {
        "upload_limit": 0,
        "is_session": False,
        "features": [
            "kpi_margins", "kpi_ratios", "health_score", "risk_score", "risk_factors",
            "runway_forecast", "burn_rate", "interactive_charts", "pdf_export",
            "ai_summary", "intelligence_card"
            # Note: Geo Insights and AI CFO omitted by default for Lite Engine
        ]
    }
}



# Special "Plan" names for Engine Feature Configuration
ENGINE_PLANS = ["_ENGINE_v1", "_ENGINE_v2"]

# Mappings for UI display
ENGINE_DISPLAY_NAMES = {
    "_ENGINE_v1": "Visi-Insight-1 (Standard)",
    "_ENGINE_v2": "Visi-Insight-2 (Lite)"
}
139
+
140
+
141
+ # =============================================================================
142
+ # HELPER FUNCTIONS
143
+ # =============================================================================
144
+
145
def get_plan_config(plan_name: str) -> Dict[str, Any]:
    """
    Get configuration for a specific plan.
    Falls back to Individual if plan not found (including the "Free"
    default stored on new users).
    """
    return PLAN_DEFAULTS.get(plan_name, PLAN_DEFAULTS["Individual"])
151
+
152
+
153
def get_upload_limit(plan_name: str) -> int:
    """Monthly upload cap for the given plan (5 when the key is absent)."""
    return get_plan_config(plan_name).get("upload_limit", 5)
157
+
158
+
159
def get_default_features(plan_name: str) -> List[str]:
    """
    Resolve the enabled feature IDs for a plan.

    The wildcard entry "*" expands to every registered feature ID.
    """
    declared = get_plan_config(plan_name).get("features", [])
    return get_all_feature_ids() if "*" in declared else declared
171
+
172
+
173
def is_session_based(plan_name: str) -> bool:
    """Check if plan uses session-based tracking (for guests, no account)."""
    config = get_plan_config(plan_name)
    # Unknown plans fall back to account-based tracking.
    return config.get("is_session", False)
177
+
178
+
179
def get_all_plans() -> List[str]:
    """Returns all plan names, including Guest, Admin and _ENGINE_* pseudo-plans."""
    return list(PLAN_DEFAULTS.keys())
182
+
183
+
184
def get_billable_plans() -> List[str]:
    """Returns plans that are actual subscription tiers (excludes Guest/Admin).

    Fix: also excludes the internal "_ENGINE_*" pseudo-plans, which exist
    only to carry engine feature flags (see ENGINE_PLANS) and are not tiers
    a customer can subscribe to — previously they leaked into this list.
    """
    non_billable = {"Guest", "Admin"}
    return [
        p for p in PLAN_DEFAULTS
        if p not in non_billable and not p.startswith("_ENGINE_")
    ]
187
+
188
+
189
def get_all_engines() -> List[str]:
    """Returns list of engine identifier keys.

    NOTE(review): returns the module-level list itself, not a copy —
    callers must not mutate it.
    """
    return ENGINE_PLANS
192
+
app/core/security.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime, timedelta
2
+ from typing import Optional
3
+ from jose import JWTError, jwt
4
+ from passlib.context import CryptContext
5
+ from app.core.config import settings
6
+
7
# Config — all values sourced from app settings (environment-driven).
SECRET_KEY = settings.SECRET_KEY
ALGORITHM = settings.ALGORITHM
ACCESS_TOKEN_EXPIRE_MINUTES = settings.ACCESS_TOKEN_EXPIRE_MINUTES

# Single shared password-hashing context; argon2 is the only active scheme.
pwd_context = CryptContext(schemes=["argon2"], deprecated="auto")
13
+
14
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """True when the plaintext matches the stored argon2 hash."""
    return pwd_context.verify(plain_password, hashed_password)
16
+
17
def get_password_hash(password: str) -> str:
    """Hash a plaintext password with argon2 for storage."""
    return pwd_context.hash(password)
19
+
20
def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str:
    """
    Create a signed JWT containing ``data`` plus an ``exp`` claim.

    Args:
        data: Claims to embed (copied, the caller's dict is not mutated).
        expires_delta: Optional lifetime; defaults to
            ACCESS_TOKEN_EXPIRE_MINUTES from settings.

    Returns:
        The encoded JWT string.
    """
    # Local import: this module only imports datetime/timedelta at the top.
    from datetime import timezone

    to_encode = data.copy()
    if expires_delta is None:
        # Fix: explicit None check — previously `if expires_delta:` silently
        # replaced a caller-supplied timedelta(0) with the default lifetime.
        expires_delta = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
    # Fix: datetime.utcnow() is deprecated and naive; use aware UTC time.
    expire = datetime.now(timezone.utc) + expires_delta
    to_encode.update({"exp": expire})
    return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
app/core/stripe_config.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import stripe
from app.core.config import settings

# Configure the global Stripe client at import time with our secret key.
stripe.api_key = settings.STRIPE_SECRET_KEY
5
+
6
def create_checkout_session(db_user, plan_id: str):
    """
    Create a Stripe Checkout session for a subscription purchase.

    Args:
        db_user: ORM user; must expose ``.email`` and ``.id``.
        plan_id: Stripe Price ID used as the subscription line item.

    Returns:
        The Stripe session object, or None when creation fails — callers
        are expected to treat None as "checkout unavailable" (best-effort).
    """
    import logging
    logger = logging.getLogger(__name__)
    try:
        checkout_session = stripe.checkout.Session.create(
            customer_email=db_user.email,
            client_reference_id=str(db_user.id),
            payment_method_types=['card'],
            line_items=[
                {
                    'price': plan_id,
                    'quantity': 1,
                },
            ],
            mode='subscription',
            # The first allowed origin doubles as the frontend base URL.
            success_url=f"{settings.ALLOWED_ORIGINS[0]}/dashboard?session_id={{CHECKOUT_SESSION_ID}}",
            cancel_url=f"{settings.ALLOWED_ORIGINS[0]}/pricing",
            metadata={
                'user_id': db_user.id,
                # TODO(review): hardcoded label — derive plan_name from plan_id.
                'plan_name': 'Business'
            }
        )
        return checkout_session
    except Exception as e:
        # Fix: route failures through logging instead of bare print() so they
        # reach production log aggregation; behavior otherwise unchanged.
        logger.error("Stripe checkout session creation failed: %s", e)
        return None
app/main.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI(
    title="Visique API",
    description="Financial Data Analyzer Backend",
    version="0.1.2"  # Bump version to clear previous failure
)

from app.core.config import settings

# CORS Configuration
# Ensure Vercel domains are allowed even if env vars override config defaults.
# settings.ALLOWED_ORIGINS may be a list or a single value depending on how
# the env var was parsed, so normalize to a list first.
origins = []
if isinstance(settings.ALLOWED_ORIGINS, list):
    origins.extend(settings.ALLOWED_ORIGINS)
else:
    origins.append(str(settings.ALLOWED_ORIGINS))

extra_origins = [
    "https://visique-testing.vercel.app",
    "https://visique-frontend.vercel.app",
    # Specific current previews
    "https://visique-testing-7qdi0vaqf-sams-projects-85f65c65.vercel.app",
    "https://visique-testing-fky1isli2-sams-projects-85f65c65.vercel.app"
]

# De-duplicate while preserving order.
for origin in extra_origins:
    if origin not in origins:
        origins.append(origin)

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    # Allow any Vercel preview domain for this specific project
    allow_origin_regex=r"https://visique-testing-.*-sams-projects-85f65c65\.vercel\.app",
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
41
+
42
@app.get("/")
async def root():
    """Landing endpoint; useful as a quick smoke test of the deployment."""
    return {"message": "Welcome to Visique Financial Analyzer API"}
45
+
46
@app.get("/health")
async def health_check():
    """Liveness probe; also the target of the keep-alive self-ping task."""
    return {"status": "healthy"}
49
+
50
from app.api.endpoints import router as analysis_router
from app.api.auth import router as auth_router
from app.core.database import engine, Base

# Run Automatic Schema Migrations (adds missing columns)
# NOTE(review): a startup handler later in this file re-defines the name
# `run_migrations`, shadowing this import — confirm and rename one of them.
from app.core.migrations import run_migrations
run_migrations(engine)

# Create Tables (for new tables only, migrations handles columns)
Base.metadata.create_all(bind=engine)

app.include_router(analysis_router, prefix="/api/v1")
app.include_router(auth_router, prefix="/api/v1")

from app.api.admin import router as admin_router
app.include_router(admin_router, prefix="/api/v1")

from app.api.visilok import router as visilok_router
app.include_router(visilok_router, prefix="/api/v1")

# Mount Static Files for Uploads
from fastapi.staticfiles import StaticFiles
import os

# Ensure upload directory exists
upload_dir = "uploads"
if not os.path.exists(upload_dir):
    os.makedirs(upload_dir)

# Mount /api/v1/static to the uploads directory
app.mount("/api/v1/static", StaticFiles(directory="uploads"), name="static")
81
+
82
from sqlalchemy import text
from app.core.database import SessionLocal

# Startup Migration for V2 Engine Support
@app.on_event("startup")
def ensure_preferred_engine_column():
    """
    Add users.preferred_engine if it doesn't exist yet (idempotent).

    Fixes:
    - Renamed from ``run_migrations``: the old name shadowed
      ``app.core.migrations.run_migrations`` imported earlier in this module.
    - The session is now closed in ``finally`` so a failing ALTER no longer
      leaks a connection (previously ``db.close()`` only ran on success).
    """
    db = None
    try:
        db = SessionLocal()
        # IF NOT EXISTS keeps this safe to run on every boot (PostgreSQL).
        db.execute(text("ALTER TABLE users ADD COLUMN IF NOT EXISTS preferred_engine VARCHAR DEFAULT 'v1'"))
        db.commit()
        print("Startup Migration: Verified preferred_engine column.")
    except Exception as e:
        print(f"Startup Migration Warning: {e}")
    finally:
        if db is not None:
            db.close()
97
+
98
+ # Keep-Alive Background Task to prevent Render free tier from sleeping
99
+ import asyncio
100
+ import httpx
101
+
102
async def keep_alive_task():
    """Pings the health endpoint every 5 minutes to prevent cold starts."""
    # Give the server a minute to finish booting before the first ping.
    await asyncio.sleep(60)

    # Resolve our own public URL (Render injects RENDER_EXTERNAL_URL).
    base_url = os.environ.get("RENDER_EXTERNAL_URL", "https://visique-backend.onrender.com")
    target = f"{base_url}/health"

    print(f"[Keep-Alive] Started. Pinging {target} every 5 minutes.")

    # One client reused for the lifetime of the task.
    async with httpx.AsyncClient() as client:
        while True:
            try:
                resp = await client.get(target, timeout=30)
                print(f"[Keep-Alive] Ping successful: {resp.status_code}")
            except Exception as exc:
                print(f"[Keep-Alive] Ping failed: {exc}")
            # Wait 5 minutes (300 seconds) before the next ping.
            await asyncio.sleep(300)
123
+
124
@app.on_event("startup")
async def start_keep_alive():
    """Starts the keep-alive background task on app startup."""
    # Fire-and-forget: the task loops forever; no handle is retained.
    asyncio.create_task(keep_alive_task())
app/models/feature_flags.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feature Flags Model - Admin-managed feature overrides per plan.
3
+
4
+ This model stores per-plan feature overrides that take precedence
5
+ over the defaults defined in plan_config.py.
6
+ """
7
+
8
+ from sqlalchemy import Column, Integer, String, Boolean, DateTime, ForeignKey
9
+ from sqlalchemy.orm import relationship
10
+ from datetime import datetime
11
+ from app.core.database import Base
12
+
13
+
14
class PlanFeatureOverride(Base):
    """
    Stores admin overrides for feature availability per plan.

    When checking if a feature is enabled for a plan:
    1. Check if override exists in this table
    2. If yes, use the override value
    3. If no, fall back to plan_config.py defaults

    NOTE(review): no unique constraint on (plan_name, feature_id) —
    duplicate override rows are possible; confirm lookup code handles that.
    """
    __tablename__ = "plan_feature_overrides"

    id = Column(Integer, primary_key=True, index=True)
    plan_name = Column(String, index=True, nullable=False)
    feature_id = Column(String, index=True, nullable=False)
    enabled = Column(Boolean, default=True, nullable=False)

    # Audit fields
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    updated_by_id = Column(Integer, ForeignKey("users.id"), nullable=True)

    def __repr__(self):
        status = "enabled" if self.enabled else "disabled"
        return f"<PlanFeatureOverride {self.plan_name}:{self.feature_id}={status}>"
37
+
38
+
39
class PlanUploadLimit(Base):
    """
    Stores admin overrides for upload limits per plan.

    When checking upload limit for a plan:
    1. Check if override exists in this table
    2. If yes, use the override value
    3. If no, fall back to plan_config.py defaults
    """
    __tablename__ = "plan_upload_limits"

    id = Column(Integer, primary_key=True, index=True)
    # Unique: at most one limit override per plan.
    plan_name = Column(String, unique=True, index=True, nullable=False)
    upload_limit = Column(Integer, nullable=False)

    # Audit fields
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    updated_by_id = Column(Integer, ForeignKey("users.id"), nullable=True)

    def __repr__(self):
        return f"<PlanUploadLimit {self.plan_name}={self.upload_limit}>"
app/models/user.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlalchemy
2
+ from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, Text, Boolean
3
+ from sqlalchemy.orm import relationship
4
+ from datetime import datetime
5
+ from app.core.database import Base
6
+
7
class User(Base):
    """Registered account, including plan/billing state and feature add-ons."""
    __tablename__ = "users"

    id = Column(Integer, primary_key=True, index=True)
    email = Column(String, unique=True, index=True)
    hashed_password = Column(String)
    full_name = Column(String, nullable=True)
    company_name = Column(String, nullable=True)
    # NOTE(review): "Free" is not a key in plan_config.PLAN_DEFAULTS, so it
    # resolves via the Individual fallback — confirm that is intended.
    plan = Column(String, default="Free")
    plan_expires_at = Column(DateTime, nullable=True)
    is_admin = Column(Boolean, default=False)
    is_super_admin = Column(Boolean, default=False)
    created_at = Column(DateTime, default=datetime.utcnow)

    # New Fields for Verification & Profile
    visique_id = Column(String, unique=True, index=True, nullable=True)  # Generated VSQ-XXXX
    ein = Column(String, nullable=True)
    address = Column(String, nullable=True)
    profile_picture_url = Column(String, nullable=True)
    industry = Column(String, default="General")
    preferred_engine = Column(String, default="v1")  # "v1" (Standard) or "v2" (Lite)

    # Upload Tracking
    monthly_upload_count = Column(Integer, default=0)
    upload_reset_date = Column(DateTime, default=datetime.utcnow)

    # Custom User-Level Feature Overrides (Add-ons)
    # Fix: use the `dict` factory instead of a shared `{}` literal so every
    # new row gets its own default mapping — a single shared literal could be
    # mutated in place and leak state across instances.
    custom_features = Column(sqlalchemy.JSON, default=dict)  # Stores { feature_id: bool }

    analyses = relationship("Analysis", back_populates="owner", cascade="all, delete-orphan")
    payments = relationship("Payment", back_populates="user", cascade="all, delete-orphan")
38
+
39
class Analysis(Base):
    """One uploaded-document analysis run, with its serialized result."""
    __tablename__ = "analyses"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id"))
    timestamp = Column(DateTime, default=datetime.utcnow)
    company_name = Column(String)
    input_filename = Column(String)
    stored_filename = Column(String)  # Path to saved file on disk
    result_json = Column(Text)  # Full analysis payload serialized as JSON text

    owner = relationship("User", back_populates="analyses")
51
+
52
class Payment(Base):
    """Billing record for a user's plan purchase."""
    __tablename__ = "payments"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id"))
    # NOTE(review): schemas.user.PaymentBase declares `amount: float`, but this
    # column is Integer — confirm the intended unit (integer cents vs dollars)
    # and align the two declarations.
    amount = Column(Integer)
    status = Column(String)  # paid, pending, overdue
    date = Column(DateTime, default=datetime.utcnow)
    plan_name = Column(String)
    invoice_pdf = Column(String, nullable=True)  # Path to invoice file

    user = relationship("User", back_populates="payments")
app/schemas/chat.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import List, Optional
3
+
4
class Message(BaseModel):
    """A single chat turn."""
    role: str  # "user" or "assistant"
    content: str
7
+
8
class ChatRequest(BaseModel):
    """Chat request: the full running history plus an optional context scope."""
    messages: List[Message]
    context_filter: Optional[str] = None  # e.g. "Balance Sheet", "Risk Report"
11
+
12
class ChatResponse(BaseModel):
    """Assistant reply plus any supporting citations."""
    response: str
    sources: List[str] = []  # Citations or references to specific data points
app/schemas/financial.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
from pathlib import Path

# Dynamic Path Resolution for the 'financial_model' library.
# Expected layout: root/visique/backend/app/schemas/financial.py with the
# library at root/financial_model — so the repository root must be on
# sys.path for `import financial_model` to resolve.
# NOTE(review): parents[4] hard-codes that directory depth; confirm when the
# tree layout changes.
try:
    current_file = Path(__file__).resolve()
    # parents[0]=schemas, [1]=app, [2]=backend, [3]=visique, [4]=repo root
    project_root = current_file.parents[4]

    # Check if 'financial_model' exists in this root
    if (project_root / "financial_model").exists():
        if str(project_root) not in sys.path:
            sys.path.insert(0, str(project_root))
    else:
        # Fallback for different execution contexts (run from repo root or
        # from backend/ — probe the CWD and two levels above it).
        cwd = Path.cwd()
        if (cwd / "financial_model").exists():
            if str(cwd) not in sys.path: sys.path.insert(0, str(cwd))
        elif (cwd.parent.parent / "financial_model").exists():
            unique_root = str(cwd.parent.parent)
            if unique_root not in sys.path: sys.path.insert(0, unique_root)

except Exception as e:
    pass  # Best-effort: the import below will fail loudly if this didn't work.

try:
    # Now import from the PACKAGE "financial_model"
    from financial_model.models import (
        PeriodType, Currency,
        IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, OperatingMetrics,
        DocumentClassification,
        FinancialReport, KPIMetrics, BudgetModel, VarianceAnalysis, RiskAnalysis,
        HealthScoreBreakdown, GeoAnalysis, RunwayForecast, OptimizationInsight,
        StandardizedDataPackage, VisiVeritasReport
    )
except ImportError:
    print("WARNING: Could not import from financial_model library. Ensure project root is in PYTHONPATH.")
    raise
app/schemas/user.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, EmailStr
2
+ from typing import Optional, List
3
+ from datetime import datetime
4
+
5
class UserBase(BaseModel):
    """Shared base for all user payloads."""
    email: str  # was EmailStr — relaxed to plain str

class UserCreate(UserBase):
    """Registration payload."""
    password: str
    full_name: Optional[str] = None
    company_name: Optional[str] = None
    # Secret key that elevates the new account — NOTE(review): confirm it is
    # validated server-side and never echoed back.
    admin_key: Optional[str] = None

class UserLogin(UserBase):
    """Login payload (email + password)."""
    password: str
16
+
17
class UserResponse(UserBase):
    """Public user payload returned by the API (mirrors models.user.User)."""
    id: int
    full_name: Optional[str] = None
    company_name: Optional[str] = None
    plan: str = "Free"
    plan_expires_at: Optional[datetime] = None
    is_admin: bool = False
    is_super_admin: bool = False
    created_at: datetime

    # New Fields
    visique_id: Optional[str] = None
    ein: Optional[str] = None
    address: Optional[str] = None
    profile_picture_url: Optional[str] = None
    industry: Optional[str] = None
    preferred_engine: Optional[str] = "v1"
    custom_features: Optional[dict] = None  # JSON feature overrides

    class Config:
        # Allow construction directly from ORM objects.
        from_attributes = True
38
+
39
class Token(BaseModel):
    """Bearer-token response for successful authentication."""
    access_token: str
    token_type: str



class TokenData(BaseModel):
    """Claims decoded from a JWT (subject email)."""
    email: Optional[str] = None
47
+
48
class AnalysisBase(BaseModel):
    """Lightweight analysis listing item."""
    company_name: str
    input_filename: str
    timestamp: datetime
    # result_json is heavy; it is served by a separate detail view.

class AnalysisResponse(AnalysisBase):
    """Analysis listing item with identifiers."""
    id: int
    user_id: int

    class Config:
        from_attributes = True
60
+
61
class UpgradeRequest(BaseModel):
    """Plan-upgrade checkout payload.

    NOTE(review): raw card_number/expiry/cvv transit through this schema —
    confirm they are never persisted or logged (PCI scope), or move card
    collection to the payment provider entirely.
    """
    plan_name: str
    amount: float = 0.0
    card_number: str
    expiry: str
    cvv: str
    # New Checkout Fields
    address: Optional[str] = None
    ein: Optional[str] = None
70
+
71
class PaymentBase(BaseModel):
    """Billing record payload.

    NOTE(review): `amount` is float here but Integer on the Payment ORM
    model — confirm units and align.
    """
    amount: float
    status: str
    plan_name: str
    date: datetime

class PaymentResponse(PaymentBase):
    """Billing record with identifier and optional invoice path."""
    id: int
    invoice_pdf: Optional[str] = None

    class Config:
        from_attributes = True
app/services/__init__.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Services Layer

This package contains all business logic for the Visique platform.

## Module Index

- `feature_service` - Feature flag resolution and plan management
- `analysis/` - Financial analysis and calculations
- `ingestion/` - Data parsing (CSV, PDF)
- `intelligence/` - AI-powered features (Gemini, RAG)
- `reporting/` - Report generation (PDF, PPTX)

## Usage Pattern

```python
from app.services.feature_service import get_effective_features, check_upload_limit
from app.services.analysis.fundamental import FundamentalAnalyzer
from app.services.intelligence.gemini_service import GeminiService
```

## Design Principles

1. **Stateless**: Services don't hold state between calls
2. **Testable**: All dependencies injected as parameters
3. **Single Purpose**: Each module handles one domain
4. **Error Handling**: Raise specific exceptions, don't swallow errors
"""

# Re-export commonly used functions for convenience.
# Note: importing this package therefore also imports feature_service.
from app.services.feature_service import (
    get_effective_features,
    check_upload_limit,
    increment_upload_count,
    get_effective_upload_limit,
)
app/services/analysis/__init__.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Financial Analysis Services

This package contains the core financial analysis logic.

## Module Responsibilities

| Module | Purpose | Key Functions |
|--------|---------|---------------|
| `fundamental.py` | Main orchestrator | `FundamentalAnalyzer.analyze()` |
| `kpi.py` | KPI calculations | `calculate_margins()`, `calculate_ratios()` |
| `risk.py` | Risk assessment | `calculate_risk_score()`, `identify_risk_factors()` |
| `health_score.py` | Overall health | `compute_health_score()` |
| `growth.py` | Growth metrics | `calculate_growth_rates()` |
| `simulation.py` | What-if modeling | `simulate_scenario()` |

## Data Flow

```
Raw Data (CSV/PDF)

Ingestion Layer (parsed dict)

FundamentalAnalyzer.analyze()
├── KPI Calculator
├── Risk Analyzer
├── Health Score
├── Growth Metrics
└── (optional) AI Enrichment

StandardizedDataPackage
```

## Usage

```python
from app.services.analysis.fundamental import FundamentalAnalyzer

analyzer = FundamentalAnalyzer()
result = await analyzer.analyze(parsed_data, user, filename)
# result is a StandardizedDataPackage (Pydantic model)
```

## Adding New Analysis Modules

1. Create new file in this directory (e.g., `budget.py`)
2. Define calculation functions with type hints
3. Import and call from `FundamentalAnalyzer.analyze()`
4. Add result to `StandardizedDataPackage` schema
5. (Optional) Register as feature in `feature_registry.py`
"""

# Re-export main analyzer for convenience.
# Note: importing this package therefore also imports fundamental (and its
# financial_model path bootstrap).
from app.services.analysis.fundamental import FundamentalAnalyzer
app/services/analysis/engine_lite.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
from pathlib import Path
from typing import List, Optional, Dict, Any
from app.schemas.financial import FinancialReport, BudgetModel

# Make the sibling 'financial_model' package importable: walk up to the
# assumed repository root and prepend it to sys.path.
# NOTE(review): parents[4] hard-codes the directory depth — confirm when the
# tree layout changes.
try:
    current_file = Path(__file__).resolve()
    project_root = current_file.parents[4]
    if (project_root / "financial_model").exists():
        if str(project_root) not in sys.path:
            sys.path.insert(0, str(project_root))
except Exception:
    pass

try:
    from financial_model.core import FinancialAnalyzer
except ImportError:
    # Fallback: last-ditch relative path (depends on the process CWD).
    sys.path.insert(0, "../../../../../")
    from financial_model.core import FinancialAnalyzer
+
23
class LiteAnalyzer:
    """
    Visi-Insight-2 (Lite Engine)
    Optimized for memory-constrained environments.
    - No External API calls (GeoService removed)
    - No Heavy Simulation (if added in future)
    - Pure Mathematical Analysis only
    """
    @staticmethod
    def analyze(report: FinancialReport, budget: Optional[BudgetModel] = None,
                comparisons: Optional[List[FinancialReport]] = None,
                user_address: Optional[str] = None,
                enabled_features: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Run the lite analysis pipeline and return the raw results dict.

        Fix: ``enabled_features`` previously defaulted to a mutable ``[]``
        (a single list shared across every call); it now defaults to None.
        The parameter is currently unused by this engine and is kept only
        for signature parity with the standard analyzer.
        """
        enabled_features = enabled_features if enabled_features is not None else []

        # Run Pure Math Analysis
        analyzer = FinancialAnalyzer(report)
        results = analyzer.run_full_analysis(budget, comparisons, user_address)

        # Tag result as Lite so consumers can tell which engine produced it.
        results['meta'] = {
            "engine": "Visi-Insight-2 (Lite)",
            "optimized": True
        }

        # Explicitly exclude heavy/external modules like GeoService.
        results['geo_analysis'] = None

        return results
app/services/analysis/factory.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.models.user import User
2
+ from app.services.analysis.fundamental import FundamentalAnalyzer
3
+ from app.services.analysis.engine_lite import LiteAnalyzer
4
+
5
class AnalysisFactory:
    """Selects an analysis engine class based on the user's preference."""

    @staticmethod
    def get_analyzer(user: User):
        """
        Return the analyzer class matching the user's ``preferred_engine``.

        'v2' selects the memory-optimized LiteAnalyzer; any other value
        (including a missing attribute) falls back to the standard
        FundamentalAnalyzer (V1).
        """
        preference = getattr(user, 'preferred_engine', 'v1')
        return LiteAnalyzer if preference == 'v2' else FundamentalAnalyzer
app/services/analysis/fundamental.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from pathlib import Path
3
+ from typing import List, Optional, Dict, Any
4
+
5
+ # Ensure project root is in path so we can import 'financial_model' package
6
+ try:
7
+ current_file = Path(__file__).resolve()
8
+ project_root = current_file.parents[4]
9
+ if (project_root / "financial_model").exists():
10
+ if str(project_root) not in sys.path:
11
+ sys.path.insert(0, str(project_root))
12
+ except Exception:
13
+ pass
14
+
15
+ from app.schemas.financial import (
16
+ FinancialReport,
17
+ BudgetModel,
18
+ StandardizedDataPackage
19
+ )
20
+ # Import Core Logic from Library Package
21
+ try:
22
+ from financial_model.core import FinancialAnalyzer
23
+ except ImportError:
24
+ # If path setup failed, try forcing the path
25
+ sys.path.insert(0, "../../../../../")
26
+ from financial_model.core import FinancialAnalyzer
27
+
28
class FundamentalAnalyzer:
    """Standard (V1) engine: core-library math plus external Geo enrichment."""

    @staticmethod
    def analyze(
        report: FinancialReport,
        budget: Optional[BudgetModel] = None,
        comparisons: Optional[List[FinancialReport]] = None,
        user_address: Optional[str] = None,
        enabled_features: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Main entry point for analysis.
        Delegates core logic to the independent 'financial_model' library.
        Enhances result with external services (GeoService).

        :param enabled_features: feature flag IDs; only "geo_insights" is
            consulted here. Defaults to no extra features.
        """
        # Fix: mutable default argument ([]) replaced with a None sentinel
        # so the list can never be shared between calls.
        if enabled_features is None:
            enabled_features = []

        # 1. Run Pure Financial Analysis (Library)
        analyzer = FinancialAnalyzer(report)
        results = analyzer.run_full_analysis(budget, comparisons, user_address)

        # 2. Inject External Services (Geo Intelligence)
        # This keeps the library pure and the backend handling integration
        geo_analysis = None
        analysis_address = None
        is_own_company = False

        # Prefer the address embedded in the report; fall back to the
        # caller-supplied user address, then to a name-based placeholder.
        if hasattr(report, 'company_address') and report.company_address:
            analysis_address = report.company_address
            if user_address and user_address.lower().strip() == report.company_address.lower().strip():
                is_own_company = True
        elif user_address:
            analysis_address = user_address
            is_own_company = True
        else:
            analysis_address = f"{report.company_name} Location"

        if "geo_insights" in enabled_features and analysis_address:
            try:
                # Imported lazily: GeoService is optional and heavyweight.
                from app.services.intelligence.geo_service import GeoService
                geo_analysis = GeoService.analyze_location(
                    analysis_address,
                    report.metrics.industry,
                    is_own_company=is_own_company,
                    company_name=report.company_name
                )
            except ImportError:
                print("Warning: GeoService not available.")
            except Exception as e:
                # Geo enrichment is best-effort; never fail the analysis.
                print(f"Error in GeoService: {e}")

        if geo_analysis:
            results['geo_analysis'] = geo_analysis

        return results
app/services/analysis/growth.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.financial import FinancialReport
2
+
3
class GrowthAnalyzer:
    """Heuristic growth-signal detection over a single reporting period."""

    @staticmethod
    def analyze_growth_potential(report: FinancialReport) -> str:
        """
        Scan the income statement for growth signals and summarize them.

        With only one period available in the standard import, this relies
        on simple heuristics rather than period-over-period trends.
        """
        income = report.income_statement
        findings = []

        # Scale heuristic: a seven-figure top line implies market traction.
        if income.revenue > 1_000_000:
            findings.append(
                "High Volume Business: Revenue > $1M suggests established market presence."
            )

        # Margin heuristic: strong operating margins suggest scalability.
        if income.operating_income and income.revenue:
            margin = income.operating_income / income.revenue
            if margin > 0.20:
                findings.append(
                    "Scalable Model: Operating margins > 20% indicate high growth potential."
                )

        if findings:
            return "Growth Potential: " + " ".join(findings)
        return "Growth Potential: Stable / Needs more historical data."
app/services/analysis/health_score.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.financial import KPIMetrics, HealthScoreBreakdown
2
+
3
class HealthScoreAnalyzer:
    """Composite 0-100 health score built from four weighted pillars."""

    @staticmethod
    def calculate(metrics: KPIMetrics) -> HealthScoreBreakdown:
        """
        Score the company across Stability (max 25), Profitability (max 35),
        Growth (max 20) and Efficiency (max 20), capping the total at 100.

        Missing (falsy) metrics contribute no points, except debt-to-equity,
        where absence (or zero) is treated as acceptable leverage.
        """
        # --- Stability: liquidity + leverage, max 25 ---
        stability = 0
        cr = metrics.current_ratio
        if cr:
            if cr > 1.5:
                stability += 15
            elif cr > 1.0:
                stability += 10
        dte = metrics.debt_to_equity
        if not dte:
            # No (or zero) leverage data: assume acceptable.
            stability += 10
        elif dte < 1.0:
            stability += 10
        elif dte < 2.0:
            stability += 5

        # --- Profitability: margins + returns, max 35 ---
        profitability = 0
        nm = metrics.net_margin
        if nm:
            if nm > 15:
                profitability += 15
            elif nm > 5:
                profitability += 10
            elif nm > 0:
                profitability += 5
        gm = metrics.gross_margin
        if gm:
            if gm > 40:
                profitability += 10
            elif gm > 20:
                profitability += 5
        if metrics.roe and metrics.roe > 15:
            profitability += 10

        # --- Growth: baseline only (single-snapshot data), max 20 ---
        growth = 10

        # --- Efficiency: collection speed, max 20 ---
        efficiency = 10
        if metrics.dso and metrics.dso < 45:
            efficiency += 10

        return HealthScoreBreakdown(
            stability=stability,
            profitability=profitability,
            growth=growth,
            efficiency=efficiency,
            total_score=min(100, stability + profitability + growth + efficiency),
        )
app/services/analysis/kpi.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.financial import IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics
2
+ from .registry import KPIRegistry
3
+
4
class KPIAnalyzer:
    """Bridges the dynamic KPIRegistry with the KPIMetrics schema."""

    @staticmethod
    def initialize_default_kpis():
        """Registers the standard Visi-Insight KPIs into the dynamic engine."""

        # Helper to avoid division by zero; returns a percentage.
        safe_div = lambda num, den: (num / den) * 100 if den and den != 0 else 0.0

        # Profitability
        KPIRegistry.register(
            "gross_margin", "Gross Profit Margin (%)", "Profitability",
            lambda r: safe_div(r.income_statement.gross_profit, r.income_statement.revenue or 1)
        )
        KPIRegistry.register(
            "operating_margin", "Operating Income Margin (%)", "Profitability",
            lambda r: safe_div(r.income_statement.operating_income, r.income_statement.revenue or 1)
        )
        KPIRegistry.register(
            "net_margin", "Net Profit Margin (%)", "Profitability",
            lambda r: safe_div(r.income_statement.net_income, r.income_statement.revenue or 1)
        )

        # Liquidity
        KPIRegistry.register(
            "current_ratio", "Current Ratio", "Liquidity",
            lambda r: r.balance_sheet.total_current_assets / (r.balance_sheet.total_current_liabilities or 1)
        )

        # Solvency
        KPIRegistry.register(
            "debt_to_equity", "Debt to Equity Ratio", "Solvency",
            lambda r: r.balance_sheet.total_liabilities / (r.balance_sheet.total_equity or 1) if r.balance_sheet.total_liabilities else 0.0
        )
        KPIRegistry.register(
            "roe", "Return on Equity (%)", "Solvency",
            lambda r: safe_div(r.income_statement.net_income, r.balance_sheet.total_equity or 1)
        )

        # Efficiency
        KPIRegistry.register(
            "dso", "Days Sales Outstanding", "Efficiency",
            lambda r: r.balance_sheet.accounts_receivable / (r.income_statement.revenue / 365) if r.income_statement.revenue and r.income_statement.revenue > 0 and r.balance_sheet.accounts_receivable else 0.0
        )

        # Specific
        KPIRegistry.register(
            "prime_cost", "Prime Cost (%)", "Service/Restaurant",
            lambda r: safe_div(r.income_statement.cogs + r.income_statement.payroll_expenses, r.income_statement.revenue or 1)
        )

    @staticmethod
    def calculate_metrics(report: 'FinancialReport') -> KPIMetrics:
        """Evaluate all registered KPIs for *report* and map them onto KPIMetrics.

        Standard KPI names are assigned to dedicated schema fields for
        backwards compatibility; any additional registered KPIs land in
        ``custom_metrics``.
        """
        # 1. Ensure the default registry is loaded
        if not KPIRegistry.get_all_kpis():
            KPIAnalyzer.initialize_default_kpis()

        # 2. Evaluate all dynamic registry KPIs against the current report object.
        # The dynamic engine returns a dict of standard KPI names -> float values.
        dynamic_results = KPIRegistry.evaluate_all(report)

        # 3. Apply standard backwards-compatible assignments on the Pydantic schema
        metrics = KPIMetrics()
        metrics.gross_margin = dynamic_results.pop("gross_margin", 0.0)
        metrics.operating_margin = dynamic_results.pop("operating_margin", 0.0)
        metrics.net_margin = dynamic_results.pop("net_margin", 0.0)

        metrics.current_ratio = dynamic_results.pop("current_ratio", 0.0)
        metrics.debt_to_equity = dynamic_results.pop("debt_to_equity", 0.0)
        metrics.roe = dynamic_results.pop("roe", 0.0)
        metrics.dso = dynamic_results.pop("dso", 0.0)
        metrics.prime_cost = dynamic_results.pop("prime_cost", 0.0)

        # All remaining dynamically registered KPIs go into the custom_metrics dict
        metrics.custom_metrics = dynamic_results

        # Extracted or Calculated Extra Metrics (Metadata).
        # Fix: both handlers used bare `except:` which swallows every
        # exception (including KeyboardInterrupt); only conversion errors
        # are expected from float().
        if "extracted_restaurant_margin" in report.metadata:
            try:
                metrics.restaurant_margin = float(report.metadata["extracted_restaurant_margin"])
            except (TypeError, ValueError):
                pass

        if "extracted_effective_tax_rate" in report.metadata:
            try:
                metrics.effective_tax_rate = float(report.metadata["extracted_effective_tax_rate"])
            except (TypeError, ValueError):
                pass
        elif report.income_statement.taxes > 0 and report.income_statement.net_income > 0:
            # Derive the rate from pre-tax income when not explicitly extracted.
            pre_tax = report.income_statement.net_income + report.income_statement.taxes
            metrics.effective_tax_rate = (report.income_statement.taxes / pre_tax) * 100

        return metrics
app/services/analysis/registry.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, Any, Optional
2
+
3
class KPIRegistry:
    """
    Dynamic KPI Registry for Visi-Insight Phase 3.

    Decouples KPI math from hardcoded backend logic: formulas are defined,
    looked up, and executed through a single class-level registry, giving a
    plug-and-play architecture where KPIs can be added on the fly.
    """

    _registry: Dict[str, Dict[str, Any]] = {}

    @classmethod
    def register(cls, name: str, description: str, category: str, formula: Callable[[Any], float]):
        """
        Register a new KPI formula.

        :param name: Unique identifier for the KPI (e.g. 'gross_margin')
        :param description: Human-readable description
        :param category: Category logic group (e.g. 'profitability')
        :param formula: Callable taking the `FinancialReport` object and returning a float
        """
        entry = {
            "description": description,
            "category": category,
            "formula": formula,
        }
        cls._registry[name] = entry

    @classmethod
    def get_formula(cls, name: str) -> Optional[Callable]:
        """Return the formula callable for *name*, or None if unregistered."""
        return cls._registry.get(name, {}).get("formula")

    @classmethod
    def get_all_kpis(cls) -> Dict[str, Dict[str, Any]]:
        """Return the full registry mapping (live reference, not a copy)."""
        return cls._registry

    @classmethod
    def evaluate(cls, name: str, report: Any) -> float:
        """
        Evaluate a single registered KPI formula safely against a report.

        Raises KeyError for unregistered names; any exception raised by the
        formula itself (e.g. division by zero) is swallowed and yields 0.0.
        """
        formula = cls.get_formula(name)
        if formula is None:
            raise KeyError(f"KPI '{name}' is not registered in the Dynamic KPI Engine.")
        try:
            return formula(report)
        except Exception:
            # Math errors (division by zero, missing fields) degrade to 0.0.
            return 0.0

    @classmethod
    def evaluate_all(cls, report: Any) -> Dict[str, float]:
        """
        Evaluate all registered KPIs for a given report.

        Returns a flat dictionary mapping each KPI name to its value.
        """
        return {name: cls.evaluate(name, report) for name in cls._registry}

    @classmethod
    def clear(cls):
        """Drop every registered KPI (testing / hot-reload support)."""
        cls._registry = {}
app/services/analysis/risk.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from app.schemas.financial import KPIMetrics, RiskAnalysis
3
+
4
class RiskAnalyzer:
    """Derives a 0-100 risk score plus categorical risk flags from KPI data."""

    @staticmethod
    def analyze(metrics: KPIMetrics, balance_cash: float = 0.0, monthly_burn: float = 0.0) -> RiskAnalysis:
        """
        Score risk by deducting penalty points from a perfect 100.

        :param metrics: calculated KPI metrics for the period
        :param balance_cash: cash on hand, used for runway estimation
        :param monthly_burn: average monthly cash burn; > 0 enables the runway check
        :returns: RiskAnalysis with the floor-capped score, factor messages,
            liquidity/solvency categories and optional runway in months
        """
        score = 100.0
        factors = []
        liquidity = "Low Risk" # Default assumes good
        solvency = "Low Risk"

        # 1. Liquidity Risk (Current Ratio)
        # NOTE(review): a current_ratio of exactly 0 is falsy and lands in
        # the "missing data" branch -- presumably intentional; confirm.
        if metrics.current_ratio:
            if metrics.current_ratio < 1.0:
                score -= 20
                factors.append("Critical: Current Ratio < 1.0 (Liquidity Issue)")
                liquidity = "Critical"
            elif metrics.current_ratio < 1.5:
                score -= 10
                factors.append("Warning: Current Ratio < 1.5")
                liquidity = "Medium"
        else:
            # Missing data is flagged but carries no score penalty.
            factors.append("Unknown: Missing Current Ratio data")

        # 2. Solvency Risk (Debt to Equity)
        if metrics.debt_to_equity:
            if metrics.debt_to_equity > 2.0:
                score -= 15
                factors.append("High Leverage: Debt/Equity > 2.0")
                solvency = "High Risk"
            elif metrics.debt_to_equity > 1.0:
                # Moderate leverage: flagged by category only, no deduction.
                solvency = "Medium Risk"

        # 3. Profitability Risk
        if metrics.net_margin and metrics.net_margin < 0:
            score -= 25
            factors.append("Loss Making: Negative Net Margin")

        # 4. Burn Rate (Runway) -- only computed when a burn rate is supplied.
        runway_months = None
        if monthly_burn > 0:
            runway_months = balance_cash / monthly_burn
            if runway_months < 3:
                score -= 25
                factors.append(f"CRITICAL: Low Cash Runway ({runway_months:.1f} months)")
                # Runway below 3 months overrides any earlier liquidity rating.
                liquidity = "Critical"
            elif runway_months < 6:
                score -= 10
                factors.append(f"Warning: Cash Runway < 6 months ({runway_months:.1f} months)")

        return RiskAnalysis(
            risk_score=max(0.0, score),
            risk_factors=factors,
            liquidity_risk=liquidity,
            solvency_risk=solvency,
            burn_rate_months=runway_months
        )
app/services/analysis/simulation.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.financial import FinancialReport, StandardizedDataPackage, KPIMetrics, RiskAnalysis, IncomeStatementStandard
2
+ from app.services.analysis.kpi import KPIAnalyzer
3
+ from app.services.analysis.risk import RiskAnalyzer
4
+ from app.services.analysis.health_score import HealthScoreAnalyzer
5
+ from app.services.analysis.fundamental import FundamentalAnalyzer
6
+ import copy
7
+
8
class SimulationService:
    """What-if scenario engine: perturbs a report and re-runs the full analysis."""

    @staticmethod
    def run_simulation(
        original_data: FinancialReport,
        delta_revenue_percent: float = 0.0,
        delta_cogs_percent: float = 0.0,
        delta_payroll_percent: float = 0.0,
        delta_marketing_percent: float = 0.0,
        delta_fixed_costs_percent: float = 0.0
    ) -> StandardizedDataPackage:
        """
        Runs a What-If scenario on the financial data.
        Delta percentages are passed as floats (e.g., 10.0 for +10%).

        :param original_data: baseline report (never mutated; deep-copied)
        :returns: a StandardizedDataPackage built from the re-analyzed copy
        """

        # Deep copy to avoid mutating original
        simulated_report = copy.deepcopy(original_data)
        income = simulated_report.income_statement

        # Apply deltas in place on the copied income statement.
        # NOTE(review): assumes these fields are numeric (not None) --
        # confirm the schema guarantees defaults.
        if delta_revenue_percent != 0:
            income.revenue *= (1 + delta_revenue_percent / 100)

        if delta_cogs_percent != 0:
            income.cogs *= (1 + delta_cogs_percent / 100)

        if delta_payroll_percent != 0:
            income.payroll_expenses *= (1 + delta_payroll_percent / 100)

        if delta_marketing_percent != 0:
            income.marketing_expenses *= (1 + delta_marketing_percent / 100)

        if delta_fixed_costs_percent != 0:
            # "Fixed costs" covers both rent and other operating expenses.
            income.rent_expense *= (1 + delta_fixed_costs_percent / 100)
            income.other_operating_expenses *= (1 + delta_fixed_costs_percent / 100)

        # Re-calculate dependent fields
        # Note: In a real complex model, variable costs might scale with revenue automatically.
        # Here we assume structure stays static unless explicitly modified.

        # Re-run Full Analysis (Phase 3 Update)
        # Instead of calling individual analyzers, call the main FundamentalAnalyzer
        # This ensures simulated data gets Runway, Optimization, etc.
        # NOTE(review): relies on FundamentalAnalyzer.analyze returning the
        # keys 'kpis', 'risk_analysis', 'health_score', 'insights',
        # 'recommendations', 'runway_forecast', 'optimization_insights'
        # with 'insights' being a mutable list -- verify against the library.
        full_analysis = FundamentalAnalyzer.analyze(simulated_report)

        # Override insights to show what changed (prepended so it reads first)
        sim_summary = f"Simulation: Rev {delta_revenue_percent:+.0f}%, COGS {delta_cogs_percent:+.0f}%, Mkt {delta_marketing_percent:+.0f}%, Fixed {delta_fixed_costs_percent:+.0f}%"
        full_analysis['insights'].insert(0, sim_summary)

        return StandardizedDataPackage(
            raw_data=simulated_report,
            kpis=full_analysis['kpis'],
            risk_analysis=full_analysis['risk_analysis'],
            health_score=full_analysis['health_score'],
            insights=full_analysis['insights'],
            recommendations=full_analysis['recommendations'],
            runway_forecast=full_analysis['runway_forecast'],
            optimization_insights=full_analysis['optimization_insights']
        )
app/services/feature_service.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feature Service - Business logic for feature flag management.
3
+
4
+ Handles the resolution of feature availability considering:
5
+ 1. Admin overrides (from database)
6
+ 2. Plan defaults (from plan_config.py)
7
+ 3. Feature registry validation
8
+ """
9
+
10
+ from typing import List, Dict, Optional, Any
11
+ from sqlalchemy.orm import Session
12
+ from datetime import datetime, timedelta
13
+
14
+ from app.core.feature_registry import (
15
+ get_all_features,
16
+ get_feature_by_id,
17
+ get_all_feature_ids,
18
+ get_features_by_category,
19
+ Feature
20
+ )
21
+ from app.core.plan_config import (
22
+ get_default_features,
23
+ get_upload_limit as get_default_upload_limit,
24
+ get_all_plans,
25
+ get_all_engines,
26
+ PLAN_DEFAULTS
27
+ )
28
+ from app.models.feature_flags import PlanFeatureOverride, PlanUploadLimit
29
+ from app.models.user import User
30
+
31
+
32
def get_effective_features(db: Session, plan_name: str) -> List[str]:
    """
    Return the enabled feature IDs for *plan_name*.

    Resolution order:
    1. Start with plan defaults from plan_config.py
    2. Apply any admin overrides stored in the database
    """
    enabled = set(get_default_features(plan_name))

    rows = (
        db.query(PlanFeatureOverride)
        .filter(PlanFeatureOverride.plan_name == plan_name)
        .all()
    )

    # Each override either force-enables or force-disables one feature.
    for row in rows:
        if row.enabled:
            enabled.add(row.feature_id)
        else:
            enabled.discard(row.feature_id)

    return list(enabled)
57
+
58
+
59
def is_feature_enabled(db: Session, plan_name: str, feature_id: str) -> bool:
    """Return True when *feature_id* is enabled for *plan_name* (overrides included)."""
    return feature_id in get_effective_features(db, plan_name)
63
+
64
+
65
def resolve_user_features(db: Session, user: User) -> List[str]:
    """
    Resolve final feature flags for a user, combining:
    1. Plan Entitlements (Base)
    2. User-Specific Overrides (Add-ons/Removals) -> stored in user.custom_features
    3. Engine Constraints (Hard Limit)

    Returns: List of enabled feature IDs.
    """
    # 1. Base plan features (admins always resolve against the Admin plan)
    current_plan = user.plan or "Free"
    if user.is_admin:
        current_plan = "Admin"

    plan_features = set(get_effective_features(db, current_plan))

    # 2. Apply user custom overrides (add-ons / removals).
    # user.custom_features is a JSON dict { "feature_id": bool }.
    # SQLAlchemy's JSON column may hand back None (default not applied yet)
    # or, on SQLite, a raw JSON string.
    custom_map = user.custom_features or {}
    if isinstance(custom_map, str):
        import json
        try:
            custom_map = json.loads(custom_map)
        except ValueError:
            # Fix: was a bare `except:` that swallowed every exception.
            # json.JSONDecodeError subclasses ValueError; a malformed value
            # simply means "no custom overrides".
            custom_map = {}

    for fid, enabled in custom_map.items():
        if enabled:
            plan_features.add(fid)
        else:
            plan_features.discard(fid)

    # 3. Apply engine constraints (hardware limits). Engine entitlements
    # are modelled as pseudo-plans keyed "_ENGINE_<name>"; default to v1.
    engine_pref = getattr(user, "preferred_engine", "v1") or "v1"
    engine_key = f"_ENGINE_{engine_pref}"
    engine_features = set(get_effective_features(db, engine_key))

    # Final Result = (Plan U Custom) INTERSECT Engine
    return list(plan_features.intersection(engine_features))
109
+
110
+
111
+
112
def get_effective_upload_limit(db: Session, plan_name: str) -> int:
    """
    Return the monthly upload limit for *plan_name*, honoring an admin
    override when one exists, otherwise the plan_config default.
    """
    override = (
        db.query(PlanUploadLimit)
        .filter(PlanUploadLimit.plan_name == plan_name)
        .first()
    )
    return override.upload_limit if override else get_default_upload_limit(plan_name)
125
+
126
+
127
def get_all_plan_features(db: Session) -> Dict[str, Dict[str, Any]]:
    """
    Build the feature configuration for every plan.

    Returns {plan_name: {"upload_limit": int, "features": {feature_id: bool}}}.
    """
    feature_ids = get_all_feature_ids()
    config = {}

    for plan in get_all_plans():
        enabled = set(get_effective_features(db, plan))
        config[plan] = {
            "upload_limit": get_effective_upload_limit(db, plan),
            "features": {fid: fid in enabled for fid in feature_ids},
        }

    return config
148
+
149
+
150
def get_feature_matrix(db: Session) -> Dict[str, Any]:
    """
    Assemble the admin-console feature matrix: every feature grouped by
    category, with per-plan and per-engine enablement flags.
    """
    categories = get_features_by_category()
    plans = get_all_plans()
    engines = get_all_engines()

    matrix = {}
    for cat_name, features in categories.items():
        rows = []
        for feature in features:
            rows.append({
                "id": feature.id,
                "name": feature.name,
                "description": feature.description,
                # Older Feature definitions may predate the memory field.
                "memory_cost_mb": getattr(feature, "memory_cost_mb", 0),
                "plans": {p: is_feature_enabled(db, p, feature.id) for p in plans},
                "engines": {e: is_feature_enabled(db, e, feature.id) for e in engines},
            })
        matrix[cat_name] = rows

    return {
        "categories": list(categories.keys()),
        "plans": plans,
        "engines": engines,
        "matrix": matrix,
    }
184
+
185
+
186
def set_feature_override(
    db: Session,
    plan_name: str,
    feature_id: str,
    enabled: bool,
    admin_id: Optional[int] = None
) -> PlanFeatureOverride:
    """
    Create or update the feature override row for (plan_name, feature_id).

    Raises ValueError if the feature ID is not in the registry.
    """
    # Validate against the feature registry before touching the database.
    if not get_feature_by_id(feature_id):
        raise ValueError(f"Unknown feature ID: {feature_id}")

    row = db.query(PlanFeatureOverride).filter(
        PlanFeatureOverride.plan_name == plan_name,
        PlanFeatureOverride.feature_id == feature_id
    ).first()

    if row is None:
        row = PlanFeatureOverride(
            plan_name=plan_name,
            feature_id=feature_id,
            enabled=enabled,
            updated_by_id=admin_id,
        )
        db.add(row)
    else:
        row.enabled = enabled
        row.updated_by_id = admin_id

    db.commit()
    db.refresh(row)
    return row
221
+
222
+
223
def bulk_set_features(
    db: Session,
    plan_name: str,
    feature_states: Dict[str, bool],
    admin_id: Optional[int] = None
) -> int:
    """
    Apply several feature overrides to one plan in a single call.

    Returns the number of features updated.
    """
    updated = 0
    for fid, state in feature_states.items():
        set_feature_override(db, plan_name, fid, state, admin_id)
        updated += 1
    return updated
238
+
239
+
240
def reset_plan_to_defaults(db: Session, plan_name: str) -> int:
    """
    Delete every override for *plan_name*, reverting it to plan_config defaults.

    Returns the number of overrides removed.
    """
    deleted = (
        db.query(PlanFeatureOverride)
        .filter(PlanFeatureOverride.plan_name == plan_name)
        .delete()
    )
    db.commit()
    return deleted
250
+
251
+
252
def check_upload_limit(db: Session, user: User) -> Dict[str, Any]:
    """
    Check if user can upload, considering their plan limit.
    Also handles the rolling 30-day reset of the monthly counter.

    Returns:
        {
            "can_upload": bool,
            "uploads_used": int,
            "uploads_limit": int,
            "uploads_remaining": int,
            "reset_date": str   # ISO-8601 timestamp of the next reset
        }
    """
    # Check if we need to reset monthly count. This is a rolling 30-day
    # window (not calendar months); naive UTC timestamps throughout.
    now = datetime.utcnow()
    if user.upload_reset_date:
        days_since_reset = (now - user.upload_reset_date).days
        if days_since_reset >= 30:
            user.monthly_upload_count = 0
            user.upload_reset_date = now
            db.commit()
    else:
        # First limit check ever: start the 30-day window now.
        user.upload_reset_date = now
        db.commit()

    # Get effective limit.
    # NOTE(review): the fallback plan here is "Individual" while
    # resolve_user_features falls back to "Free" -- confirm which is intended.
    plan = user.plan or "Individual"
    if user.is_admin:
        plan = "Admin"

    limit = get_effective_upload_limit(db, plan)
    used = user.monthly_upload_count or 0
    remaining = max(0, limit - used)

    # Calculate next reset
    next_reset = user.upload_reset_date + timedelta(days=30) if user.upload_reset_date else now + timedelta(days=30)

    return {
        "can_upload": used < limit,
        "uploads_used": used,
        "uploads_limit": limit,
        "uploads_remaining": remaining,
        "reset_date": next_reset.isoformat()
    }
297
+
298
+
299
def increment_upload_count(db: Session, user: User) -> int:
    """
    Bump the user's monthly upload counter (call after a successful upload).

    Returns the new count.
    """
    current = user.monthly_upload_count or 0
    user.monthly_upload_count = current + 1
    db.commit()
    return user.monthly_upload_count
app/services/ingestion/__init__.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Ingestion Layer - File parsing and data extraction.
3
+
4
+ This package handles parsing of various financial document formats
5
+ and standardizing them into a common FinancialReport schema.
6
+
7
+ ## Supported Formats
8
+
9
+ | Format | Parser | Description |
10
+ |--------|--------|-------------|
11
+ | CSV | CSVParser | Comma-separated financial data |
12
+ | PDF | HybridPDFParser | Dolphin-v2 + pdfplumber hybrid extraction |
13
+ | PDF | PDFParser | Legacy pdfplumber-only parser |
14
+ | XLSX/XLS | XLSXParser | Excel workbooks |
15
+
16
+ ## PDF Hybrid Architecture
17
+
18
+ PDF files are processed by both Dolphin-v2 and pdfplumber:
19
+ 1. Dolphin: layout analysis, document classification, element extraction
20
+ 2. pdfplumber: gap-filling table + regex extraction
21
+ 3. Merge: Dolphin fields take priority, pdfplumber fills gaps
22
+
23
+ If Dolphin is not installed, falls back to pdfplumber-only automatically.
24
+
25
+ ## Usage
26
+
27
+ Use UnifiedParser for automatic format detection:
28
+
29
+ ```python
30
+ from app.services.ingestion import UnifiedParser
31
+
32
+ report = UnifiedParser.parse(file_path, original_filename)
33
+ ```
34
+
35
+ Or use specific parsers directly:
36
+
37
+ ```python
38
+ from app.services.ingestion import CSVParser, HybridPDFParser, XLSXParser
39
+
40
+ report = CSVParser.parse(file_path)
41
+ report = HybridPDFParser.parse(file_path) # Dolphin + pdfplumber
42
+ report = XLSXParser.parse(file_path)
43
+ ```
44
+
45
+ ## Adding New Formats
46
+
47
+ 1. Create `parser_xxx.py` with a class implementing `parse(file_path) -> FinancialReport`
48
+ 2. Register in `unified_parser.py` SUPPORTED_EXTENSIONS dict
49
+ 3. Add import in this `__init__.py`
50
+ """
51
+
52
+ from app.services.ingestion.unified_parser import UnifiedParser
53
+ from app.services.ingestion.parser_csv import CSVParser
54
+ from app.services.ingestion.parser_pdf import PDFParser
55
+ from app.services.ingestion.parser_dolphin import HybridPDFParser
56
+ from app.services.ingestion.parser_xlsx import XLSXParser
57
+ from app.services.ingestion.mappings import DataMapper
app/services/ingestion/doc_keywords.py ADDED
@@ -0,0 +1,1408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document Keyword Registry — 53 Financial Document Types
3
+ ========================================================
4
+ Central registry mapping document types to their identifying keywords and
5
+ extractable field variables. Used by the classifier to identify uploaded
6
+ documents and guide targeted extraction.
7
+
8
+ Generated from the doc-keywords-mapped reference data.
9
+ Learned keywords from admin training are loaded from learned_keywords.json
10
+ and merged at startup.
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import logging
16
+ from dataclasses import dataclass, field
17
+ from typing import Dict, List, Optional
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
@dataclass
class DocTypeDefinition:
    """Definition of a single financial document type.

    Instances of this dataclass populate ``DOC_TYPE_REGISTRY`` and drive
    keyword-based classification of uploaded documents as well as targeted
    extraction of the declared field variables.
    """
    id: int                        # Stable numeric identifier for the document type
    display_name: str              # Human-readable name shown in the UI
    keywords: List[str]            # Keywords used for classification (case-insensitive)
    fields: List[str]              # Extractable FIELD: variables
    category: str = "general"      # Category grouping for UI display
    min_keyword_matches: int = 2   # Minimum keyword hits before scoring
31
+
32
+
33
+ # ============================================================================
34
+ # 53 DOCUMENT TYPE DEFINITIONS
35
+ # ============================================================================
36
+
37
+ DOC_TYPE_REGISTRY: Dict[str, DocTypeDefinition] = {
38
+
39
+ # ── 1. SOC 2 Audit Report ───────────────────────────────────────────
40
+ "soc_2_audit": DocTypeDefinition(
41
+ id=1,
42
+ display_name="SOC 2 Audit Report",
43
+ category="compliance",
44
+ keywords=[
45
+ "INDEPENDENT SERVICE AUDITOR'S REPORT", "SOC 2 TYPE",
46
+ "SYSTEM DESCRIPTION", "TRUST SERVICES CRITERIA",
47
+ "RELEVANT TO SECURITY", "AICPA",
48
+ "ASSERTION OF MANAGEMENT", "SECTION III", "SECTION IV",
49
+ "SOC 2", "SYSTEM AND ORGANIZATION CONTROLS 2",
50
+ "TRUST SERVICES PRINCIPLES", "SECURITY", "AVAILABILITY",
51
+ "PROCESSING INTEGRITY", "CONFIDENTIALITY", "PRIVACY",
52
+ "CONTROL ACTIVITIES", "CONTROL ENVIRONMENT",
53
+ "MONITORING CONTROLS", "opinion", "auditor's conclusion",
54
+ "basis for opinion", "independent auditor's report",
55
+ "qualified/unqualified opinion",
56
+ "opinion on the fairness of the presentation",
57
+ "period covered", "review period", "examination period",
58
+ "service organization", "company name", "organization",
59
+ "exceptions noted", "deviations", "testing exceptions",
60
+ "no exceptions noted", "control exceptions", "deficiencies",
61
+ "weaknesses",
62
+ ],
63
+ fields=[
64
+ "auditor_opinion", "period_covered", "service_organization",
65
+ "control_exceptions", "criteria",
66
+ ],
67
+ ),
68
+
69
+ # ── 2. ARR/MRR Waterfall ────────────────────────────────────────────
70
+ "arr_mrr_waterfall": DocTypeDefinition(
71
+ id=2,
72
+ display_name="ARR/MRR Waterfall",
73
+ category="saas_metrics",
74
+ keywords=[
75
+ "ARR WATERFALL", "MRR ROLLFORWARD",
76
+ "RECURRING REVENUE BRIDGE", "SAAS METRICS",
77
+ "ANNUALIZED RECURRING REVENUE", "CARR",
78
+ "RECURRING REVENUE MOVEMENT", "NET NEW ARR",
79
+ "RECURRING REVENUE WATERFALL", "RECURRING REVENUE ROLLFORWARD",
80
+ "ARR BRIDGE", "MRR BRIDGE",
81
+ "beginning arr", "opening arr", "new logos", "expansion",
82
+ "contraction", "churn", "cancellation", "ending arr",
83
+ "closing arr", "exit arr", "end of period arr", "eop arr",
84
+ "new sales", "upsell", "cross-sell", "price increase",
85
+ "downgrade", "shrinkage", "downsell", "reduction",
86
+ "gross churn", "logo churn", "lost arr",
87
+ "new logo arr", "gross new arr", "new business",
88
+ "new customers", "reactivation",
89
+ ],
90
+ fields=[
91
+ "beginning_arr", "new_logo_arr", "expansion_arr",
92
+ "contraction_arr", "churn_arr", "ending_arr",
93
+ ],
94
+ ),
95
+
96
+ # ── 3. Deferred Revenue Schedule (ASC 606) ──────────────────────────
97
+ "deferred_revenue_schedule": DocTypeDefinition(
98
+ id=3,
99
+ display_name="Deferred Revenue Schedule (ASC 606)",
100
+ category="revenue_recognition",
101
+ keywords=[
102
+ "DEFERRED REVENUE ROLLFORWARD", "CONTRACT LIABILITY SCHEDULE",
103
+ "UNEARNED REVENUE ANALYSIS", "REVENUE RECOGNITION SCHEDULE",
104
+ "ASC 606 DISCLOSURE", "ASC 606", "IFRS 15",
105
+ "SHORT-TERM DEFERRED REVENUE", "LONG-TERM DEFERRED REVENUE",
106
+ "REMAINING PERFORMANCE OBLIGATION",
107
+ "beginning balance", "beginning deferred revenue",
108
+ "opening contract liability", "balance at beginning",
109
+ "billings", "invoiced", "new contracts", "fees billed",
110
+ "revenue recognized", "earned revenue",
111
+ "ending balance", "ending deferred revenue",
112
+ "closing contract liability", "balance at end",
113
+ "satisfaction of performance obligation",
114
+ "transfer to revenue", "amortization",
115
+ "contract with customer liability",
116
+ ],
117
+ fields=[
118
+ "beginning_balance", "billings", "revenue_recognized",
119
+ "ending_balance",
120
+ ],
121
+ ),
122
+
123
+ # ── 4. CAC vs. LTV Model ───────────────────────────────────────────
124
+ "cac_ltv_model": DocTypeDefinition(
125
+ id=4,
126
+ display_name="CAC vs. LTV Model",
127
+ category="saas_metrics",
128
+ keywords=[
129
+ "UNIT ECONOMICS", "LTV/CAC ANALYSIS",
130
+ "CUSTOMER ACQUISITION COST", "COHORT ANALYSIS",
131
+ "LIFETIME VALUE MODEL", "SAAS UNIT ECONOMICS",
132
+ "LTV/CAC RATIO", "MAGIC NUMBER",
133
+ "cac", "customer acquisition cost", "blended cac", "paid cac",
134
+ "cost per acquisition", "ltv", "lifetime value", "cltv",
135
+ "customer lifetime value", "payback period",
136
+ "months to recover", "months to recover cac", "cac payback",
137
+ "arpu", "average revenue per user",
138
+ "average revenue per account", "arpa",
139
+ ],
140
+ fields=[
141
+ "cac", "ltv", "ltv_cac_ratio", "payback_period", "arpu",
142
+ ],
143
+ ),
144
+
145
+ # ── 5. Booking / Backlog Report ─────────────────────────────────────
146
+ "booking_backlog": DocTypeDefinition(
147
+ id=5,
148
+ display_name="Booking / Backlog Report",
149
+ category="sales",
150
+ keywords=[
151
+ "SALES BACKLOG", "REMAINING PERFORMANCE OBLIGATIONS",
152
+ "OPEN ORDERS REPORT", "ORDER BOOK",
153
+ "BOOKINGS BY CUSTOMER", "UNFULFILLED ORDERS",
154
+ "ORDER BACKLOG", "CONTRACT BACKLOG",
155
+ "booking amount", "order value", "total contracted value",
156
+ "remaining obligation", "open balance", "unfilled orders",
157
+ "remaining performance obligation", "remaining contract value",
158
+ "backlog", "open amount", "po amount",
159
+ "booking date", "order date", "signed date",
160
+ "contract effective date", "po date",
161
+ ],
162
+ fields=[
163
+ "customer_name", "contract_value", "backlog_amount",
164
+ "booking_date",
165
+ ],
166
+ ),
167
+
168
+ # ── 6. PCI DSS Compliance ───────────────────────────────────────────
169
+ "pci_dss": DocTypeDefinition(
170
+ id=6,
171
+ display_name="PCI DSS Compliance",
172
+ category="compliance",
173
+ keywords=[
174
+ "ATTESTATION OF COMPLIANCE", "PCI DSS",
175
+ "REPORT ON COMPLIANCE", "DATA SECURITY STANDARD",
176
+ "PAYMENT CARD INDUSTRY", "AOC",
177
+ "PAYMENT CARD INDUSTRY DATA SECURITY STANDARD",
178
+ "SELF-ASSESSMENT QUESTIONNAIRE", "SAQ", "QSA",
179
+ "QUALIFIED SECURITY ASSESSOR", "ASV SCAN",
180
+ "compliant", "non-compliant", "validation status",
181
+ "overall compliance status", "in compliance",
182
+ "merchant level", "service provider level", "merchant tier",
183
+ ],
184
+ fields=[
185
+ "merchant_level", "compliance_status", "assessment_date",
186
+ "qsa_name",
187
+ ],
188
+ ),
189
+
190
+ # ── 7. Sales Tax Nexus ──────────────────────────────────────────────
191
+ "sales_tax_nexus": DocTypeDefinition(
192
+ id=7,
193
+ display_name="Sales Tax Nexus",
194
+ category="tax",
195
+ keywords=[
196
+ "NEXUS STUDY", "ECONOMIC NEXUS", "WAYFAIR ANALYSIS",
197
+ "PHYSICAL PRESENCE TEST", "WAYFAIR",
198
+ "STATE TAX EXPOSURE", "NEXUS DETERMINATION",
199
+ "nexus established", "no nexus", "nexus",
200
+ "physical nexus", "economic nexus",
201
+ "registration required", "sales tax registration",
202
+ "revenue threshold met", "sales threshold",
203
+ "transaction count met", "threshold exceeded",
204
+ "estimated exposure", "potential liability", "tax due",
205
+ "tax at risk", "estimated tax due",
206
+ ],
207
+ fields=[
208
+ "jurisdiction", "threshold_met", "exposure_amount",
209
+ ],
210
+ ),
211
+
212
+ # ── 8. Inventory Aging Report ───────────────────────────────────────
213
+ "inventory_aging": DocTypeDefinition(
214
+ id=8,
215
+ display_name="Inventory Aging Report",
216
+ category="inventory",
217
+ keywords=[
218
+ "INVENTORY AGING", "STOCK AGE ANALYSIS",
219
+ "OBSOLETE INVENTORY", "DEAD STOCK REPORT",
220
+ "SLOW MOVING REPORT", "INVENTORY VALUATION BY AGE",
221
+ "item code", "sku", "part number", "material number",
222
+ "product id", "item id",
223
+ "0-30", "31-60", "61-90", "90+", "120+",
224
+ "0 to 30", "30-60 days", "60-90 days", "over 90",
225
+ "current", "obsolete", "dead stock", "slow moving",
226
+ "days on hand", "total on hand", "extended cost",
227
+ "valuation", "total cost", "total inventory value",
228
+ ],
229
+ fields=[
230
+ "item_id", "0_30_days", "31_60_days", "61_90_days",
231
+ "over_90_days", "total_value",
232
+ ],
233
+ ),
234
+
235
+ # ── 9. GMROI Schedule ───────────────────────────────────────────────
236
+ "gmroi_schedule": DocTypeDefinition(
237
+ id=9,
238
+ display_name="GMROI Schedule",
239
+ category="retail",
240
+ keywords=[
241
+ "GMROI", "GROSS MARGIN RETURN ON INVESTMENT",
242
+ "INVENTORY PERFORMANCE", "MERCHANDISE PERFORMANCE",
243
+ "gross margin", "gross profit",
244
+ "average inventory", "avg inventory cost",
245
+ "average inventory at cost", "average stock",
246
+ "gmroi", "return on inventory", "index",
247
+ "gmroi %", "turnover", "turns", "stock turn",
248
+ "inventory turnover",
249
+ ],
250
+ fields=[
251
+ "gross_margin", "avg_inventory", "gmroi_ratio", "turnover",
252
+ ],
253
+ ),
254
+
255
+ # ── 10. Open-to-Buy (OTB) Plan ─────────────────────────────────────
256
+ "open_to_buy": DocTypeDefinition(
257
+ id=10,
258
+ display_name="Open-to-Buy (OTB) Plan",
259
+ category="retail",
260
+ keywords=[
261
+ "OPEN TO BUY", "OTB PLAN", "MERCHANDISE BUDGET",
262
+ "PURCHASING BUDGET", "RETAIL BUDGET",
263
+ "STOCK TO SALES", "INVENTORY PLAN",
264
+ "projected sales", "planned sales", "sales forecast",
265
+ "forecasted sales", "planned ending inventory",
266
+ "open to buy at retail", "otb purchases",
267
+ "receipt plan", "open to buy", "purchase budget",
268
+ "markdowns", "reductions", "discounts",
269
+ "promotional deductions", "price adjustments",
270
+ "eop inventory",
271
+ ],
272
+ fields=[
273
+ "projected_sales", "markdowns", "eop_inventory",
274
+ "receipts", "gross_sales",
275
+ ],
276
+ ),
277
+
278
+ # ── 11. Sales & Use Tax Filings ─────────────────────────────────────
279
+ "sales_use_tax": DocTypeDefinition(
280
+ id=11,
281
+ display_name="Sales & Use Tax Filings",
282
+ category="tax",
283
+ keywords=[
284
+ "SALES AND USE TAX RETURN", "FORM ST-",
285
+ "MULTISTATE TAX RETURN", "SALES TAX PAYABLE",
286
+ "TAX REMITTANCE", "GROSS RECEIPTS",
287
+ "ST-3", "ST-9",
288
+ "taxable sales", "taxable amount", "taxable receipts",
289
+ "net taxable", "tax collected", "tax due", "total tax",
290
+ "amount remitted", "tax payable",
291
+ "gross sales", "total sales", "gross receipts", "total receipts",
292
+ "filing period",
293
+ ],
294
+ fields=[
295
+ "jurisdiction", "gross_sales", "taxable_sales",
296
+ "tax_collected", "tax_due",
297
+ ],
298
+ ),
299
+
300
+ # ── 12. UNICAP (Sec 263A) ───────────────────────────────────────────
301
+ "unicap": DocTypeDefinition(
302
+ id=12,
303
+ display_name="UNICAP (Sec 263A)",
304
+ category="tax",
305
+ keywords=[
306
+ "SECTION 263A", "UNICAP CALCULATION",
307
+ "SIMPLIFIED PRODUCTION METHOD", "SIMPLIFIED RESALE METHOD",
308
+ "INVENTORY CAPITALIZATION", "ABSORPTION RATIO",
309
+ "HISTORIC ABSORPTION RATIO",
310
+ "costs incurred", "additional costs",
311
+ "capitalizable costs", "capitalized inventory adjustment",
312
+ "section 263a costs", "unicap adjustment",
313
+ "absorption ratio", "historic absorption ratio",
314
+ "allocation ratio", "capitalization rate",
315
+ "ending inventory", "inventory balance", "total inventory",
316
+ ],
317
+ fields=[
318
+ "costs_incurred", "absorption_ratio", "unicap_adjustment",
319
+ "ending_inventory",
320
+ ],
321
+ ),
322
+
323
+ # ── 13. Customer Concentration Report ──────────────────────────────
324
+ "customer_concentration": DocTypeDefinition(
325
+ id=13,
326
+ display_name="Customer Concentration Report",
327
+ category="risk",
328
+ keywords=[
329
+ "CUSTOMER CONCENTRATION", "REVENUE BY CUSTOMER",
330
+ "CLIENT EXPOSURE", "TOP 10 CUSTOMERS",
331
+ "CONSUMER CONCENTRATION REPORT", "SUPPLIER CONCENTRATION REPORT",
332
+ "PARETO ANALYSIS", "WALLET SHARE",
333
+ "concentration %", "cumulative %", "running total %",
334
+ "cumulative share", "share", "percentage of revenue",
335
+ "% of total", "revenue", "total billed", "ytd revenue",
336
+ "annual sales", "customer name", "client",
337
+ ],
338
+ fields=[
339
+ "customer_name", "revenue_amount", "percent_total",
340
+ "cumulative_percent",
341
+ ],
342
+ ),
343
+
344
+ # ── 14. 13-Week Cash Flow Forecast ──────────────────────────────────
345
+ "thirteen_week_cash_flow": DocTypeDefinition(
346
+ id=14,
347
+ display_name="13-Week Cash Flow Forecast",
348
+ category="treasury",
349
+ keywords=[
350
+ "13-WEEK CASH FLOW", "TWELVE WEEK CASH",
351
+ "SHORT TERM LIQUIDITY", "WEEKLY CASH FORECAST",
352
+ "ROLLING CASH FORECAST", "DIRECT METHOD CASH FLOW",
353
+ "ROLLING 13 WEEK CASH FLOW", "SHORT TERM LIQUIDITY FORECAST",
354
+ "13-WEEK ROLLING CASH FLOW",
355
+ "week ending", "week of", "w/e", "week ended",
356
+ "cash receipts", "customer payments", "ar receipts",
357
+ "cash inflows from customers",
358
+ "collections", "cash outflows", "vendor payments",
359
+ "ap payments", "operating disbursements",
360
+ "net cash flow", "net change in cash", "burn",
361
+ "net increase/(decrease)", "ending cash balance",
362
+ "closing cash", "cash position", "period-end cash",
363
+ ],
364
+ fields=[
365
+ "week_ending", "collections", "disbursements",
366
+ "net_cash_flow", "ending_cash",
367
+ ],
368
+ ),
369
+
370
+ # ── 15. Rent Roll ───────────────────────────────────────────────────
371
+ "rent_roll": DocTypeDefinition(
372
+ id=15,
373
+ display_name="Rent Roll",
374
+ category="real_estate",
375
+ keywords=[
376
+ "RENT ROLL", "TENANT ROSTER", "LEASE SCHEDULE",
377
+ "TENANCY SCHEDULE", "PROPERTY RENT ROLL",
378
+ "tenant", "resident", "lessee", "occupant", "tenant name",
379
+ "unit", "suite", "space no", "unit number", "apartment number",
380
+ "lease start", "lease end", "commencement", "expiration",
381
+ "termination date",
382
+ "base rent", "monthly rent", "contract rent", "scheduled rent",
383
+ "market rent", "gross potential rent",
384
+ "total rental revenue", "base rental revenue",
385
+ "vacancy", "vacancy loss", "credit loss", "concessions",
386
+ "effective gross income",
387
+ ],
388
+ fields=[
389
+ "unit_id", "tenant_name", "lease_dates", "base_rent",
390
+ "sq_ft", "security_deposit",
391
+ ],
392
+ ),
393
+
394
+ # ── 16. NOI Statement ───────────────────────────────────────────────
395
+ "noi_statement": DocTypeDefinition(
396
+ id=16,
397
+ display_name="NOI Statement",
398
+ category="real_estate",
399
+ keywords=[
400
+ "NET OPERATING INCOME", "NOI STATEMENT",
401
+ "PROPERTY OPERATING STATEMENT", "PROPERTY P&L",
402
+ "INCOME AND EXPENSE STATEMENT",
403
+ "REAL ESTATE OPERATING STATEMENT",
404
+ "rental revenue", "rental income", "gross potential rent",
405
+ "vacancy loss", "effective gross income",
406
+ "total operating expenses", "property expenses",
407
+ "operating costs", "operating profit",
408
+ "net operating income", "noi",
409
+ "income before debt service",
410
+ "management fees", "property management", "management expense",
411
+ ],
412
+ fields=[
413
+ "rental_revenue", "vacancy_loss", "operating_expenses",
414
+ "noi", "management_fees",
415
+ ],
416
+ ),
417
+
418
+ # ── 17. Occupancy Tax Schedule ──────────────────────────────────────
419
+ "occupancy_tax": DocTypeDefinition(
420
+ id=17,
421
+ display_name="Occupancy Tax Schedule",
422
+ category="hospitality",
423
+ keywords=[
424
+ "OCCUPANCY TAX RETURN", "TOT RETURN",
425
+ "TRANSIENT OCCUPANCY TAX", "LODGING TAX", "HOTEL TAX",
426
+ "ROOM TAX", "SHORT-TERM RENTAL TAX",
427
+ "room revenue", "lodging revenue",
428
+ "taxable receipts", "gross rents",
429
+ "tax due", "total tax", "remittance amount",
430
+ "tax payable", "non-taxable", "tax-exempt revenue",
431
+ "permanent residents", "rooms sold",
432
+ "occupancy percentage", "occupancy %",
433
+ ],
434
+ fields=[
435
+ "room_revenue", "tax_due", "occupancy_rate", "exemptions",
436
+ ],
437
+ ),
438
+
439
+ # ── 18. Fair Housing Compliance Audit ───────────────────────────────
440
+ "fair_housing": DocTypeDefinition(
441
+ id=18,
442
+ display_name="Fair Housing Compliance Audit",
443
+ category="compliance",
444
+ keywords=[
445
+ "FAIR HOUSING COMPLIANCE", "ADA AUDIT",
446
+ "ACCESSIBILITY CHECKLIST", "HUD REVIEW",
447
+ "EQUAL HOUSING OPPORTUNITY", "UFAS COMPLIANCE", "UFAS",
448
+ "AMERICANS WITH DISABILITIES ACT", "SECTION 504",
449
+ "fair housing", "violation", "deficiency", "non-compliance",
450
+ "finding", "corrective action", "remediation",
451
+ "action plan", "corrective measures",
452
+ ],
453
+ fields=[
454
+ "violation_status", "remediation_plan", "audit_date",
455
+ "property_address",
456
+ ],
457
+ ),
458
+
459
+ # ── 19. ASC 842 Lease Liability Schedule ────────────────────────────
460
+ "asc_842_lease": DocTypeDefinition(
461
+ id=19,
462
+ display_name="ASC 842 Lease Liability Schedule",
463
+ category="accounting",
464
+ keywords=[
465
+ "LEASE LIABILITY AMORTIZATION", "ROU ASSET ROLLFORWARD",
466
+ "ASC 842", "IFRS 16", "LEASE OBLIGATION",
467
+ "RIGHT OF USE ASSET", "OPERATING LEASE LIABILITY",
468
+ "FINANCE LEASE SCHEDULE",
469
+ "rou asset", "right of use asset", "operating lease asset",
470
+ "finance lease asset", "lease liability", "lease obligation",
471
+ "present value of payments", "present value of lease payments",
472
+ "future minimum lease payments", "single lease cost",
473
+ "lease expense", "amortization of rou",
474
+ "interest on lease liability",
475
+ "incremental borrowing rate", "ibr",
476
+ "rate implicit in the lease",
477
+ "discount rate",
478
+ ],
479
+ fields=[
480
+ "rou_asset", "lease_liability", "discount_rate",
481
+ "lease_payment", "lease_expense",
482
+ ],
483
+ ),
484
+
485
+ # ── 20. Fixed Asset Roll-forward (FAR) ──────────────────────────────
486
+ "fixed_asset_rollforward": DocTypeDefinition(
487
+ id=20,
488
+ display_name="Fixed Asset Roll-forward (FAR)",
489
+ category="accounting",
490
+ keywords=[
491
+ "FIXED ASSET ROLLFORWARD", "PPE SCHEDULE",
492
+ "DEPRECIATION SCHEDULE", "CHANGES IN CAPITAL ASSETS",
493
+ "CAPITAL ASSET MOVEMENT", "PLANT AND EQUIPMENT SCHEDULE",
494
+ "beginning net book value", "beginning nbv",
495
+ "opening balance", "beginning cost",
496
+ "additions", "purchases", "acquisitions", "asset acquisitions",
497
+ "disposals", "retirements", "sales", "write-offs",
498
+ "depreciation expense", "current depreciation",
499
+ "provision for depreciation",
500
+ "net book value", "closing balance", "ending net book value",
501
+ ],
502
+ fields=[
503
+ "asset_class", "beg_book_value", "additions", "disposals",
504
+ "depreciation_exp",
505
+ ],
506
+ ),
507
+
508
+ # ── 21. CapEx Reserve Schedule ──────────────────────────────────────
509
+ "capex_reserve": DocTypeDefinition(
510
+ id=21,
511
+ display_name="CapEx Reserve Schedule",
512
+ category="real_estate",
513
+ keywords=[
514
+ "CAPEX RESERVE", "REPLACEMENT RESERVE",
515
+ "FF&E RESERVE", "CAPITAL IMPROVEMENT BUDGET",
516
+ "SINKING FUND", "RESERVE FOR REPLACEMENT",
517
+ "RESERVE FOR REPLACEMENTS", "CAPITAL RESERVE STUDY",
518
+ "beginning reserve balance", "reserve balance", "fund balance",
519
+ "escrow balance", "monthly deposit", "contribution",
520
+ "scheduled deposit", "transfer to reserve",
521
+ "draws", "disbursements from reserve",
522
+ "reimbursements", "reserve releases", "withdrawals",
523
+ ],
524
+ fields=[
525
+ "reserve_balance", "monthly_deposit", "draws",
526
+ "approved_projects",
527
+ ],
528
+ ),
529
+
530
+ # ── 22. Cost of Goods Manufactured (COGM) ──────────────────────────
531
+ "cogm": DocTypeDefinition(
532
+ id=22,
533
+ display_name="Cost of Goods Manufactured (COGM)",
534
+ category="manufacturing",
535
+ keywords=[
536
+ "COST OF GOODS MANUFACTURED", "MANUFACTURING STATEMENT",
537
+ "PRODUCTION COST SCHEDULE", "MANUFACTURING EXPENSES",
538
+ "SCHEDULE OF COGM", "STATEMENT OF COST OF GOODS MANUFACTURED",
539
+ "COGM SCHEDULE",
540
+ "direct materials", "raw materials consumed",
541
+ "direct material consumed", "material costs",
542
+ "direct labor", "manufacturing labor", "touch labor",
543
+ "direct manufacturing labor",
544
+ "factory overhead", "manufacturing overhead",
545
+ "indirect costs", "burden", "manufacturing burden",
546
+ "beginning work in process", "beginning wip",
547
+ "ending work in process", "ending wip",
548
+ "cost of goods manufactured", "cogm", "manufacturing costs",
549
+ ],
550
+ fields=[
551
+ "direct_materials", "direct_labor", "factory_overhead",
552
+ "opening_wip", "closing_wip", "cost_goods_mfd",
553
+ ],
554
+ ),
555
+
556
+ # ── 23. Production Variance Report ──────────────────────────────────
557
+ "production_variance": DocTypeDefinition(
558
+ id=23,
559
+ display_name="Production Variance Report",
560
+ category="manufacturing",
561
+ keywords=[
562
+ "PRODUCTION VARIANCE", "MATERIAL USAGE VARIANCE",
563
+ "STANDARD COST VARIANCE", "COST VARIANCE ANALYSIS",
564
+ "PRODUCTION VARIANCE REPORT",
565
+ "standard cost", "planned cost", "expected cost",
566
+ "actual cost", "incurred cost",
567
+ "variance", "difference", "favorable/unfavorable",
568
+ "price variance", "usage variance", "volume variance",
569
+ "efficiency variance", "rate variance",
570
+ "yield variance", "labor efficiency variance",
571
+ "overhead variance",
572
+ ],
573
+ fields=[
574
+ "item_id", "standard_cost", "actual_cost",
575
+ "variance_amount", "variance_type",
576
+ ],
577
+ ),
578
+
579
+ # ── 24. WIP Inventory Valuation ─────────────────────────────────────
580
+ "wip_valuation": DocTypeDefinition(
581
+ id=24,
582
+ display_name="WIP Inventory Valuation",
583
+ category="manufacturing",
584
+ keywords=[
585
+ "WIP VALUATION", "WORK IN PROCESS INVENTORY",
586
+ "JOB COST REPORT", "PROJECT COST SUMMARY",
587
+ "UNBILLED COSTS", "WIP AGING",
588
+ "CONSTRUCTION-IN-PROGRESS",
589
+ "job number", "project id", "work order", "job name",
590
+ "costs to date", "total costs", "cumulative costs",
591
+ "billed to date", "amounts billed", "progress billings",
592
+ "wip balance", "net wip", "costs in excess of billings",
593
+ "unbilled work", "unbilled costs",
594
+ "% complete", "poc", "completion %", "percentage of completion",
595
+ ],
596
+ fields=[
597
+ "job_id", "costs_to_date", "billing_to_date",
598
+ "wip_balance", "percent_complete",
599
+ ],
600
+ ),
601
+
602
+ # ── 25. Open Purchase Order (PO) Log ────────────────────────────────
603
+ "open_po_log": DocTypeDefinition(
604
+ id=25,
605
+ display_name="Open Purchase Order (PO) Log",
606
+ category="procurement",
607
+ keywords=[
608
+ "OPEN PURCHASE ORDERS", "PO STATUS REPORT",
609
+ "OUTSTANDING ORDERS", "VENDOR COMMITMENTS",
610
+ "UNRECEIVED PO", "PURCHASE ORDER LOG",
611
+ "OPEN PO BY VENDOR",
612
+ "po number", "purchase order", "po #", "order no",
613
+ "order date", "purchase order date", "po date",
614
+ "vendor", "supplier", "manufacturer", "vendor name",
615
+ "expected delivery", "delivery date", "eta", "due date",
616
+ "promised date", "open amount", "remaining balance",
617
+ "outstanding balance", "unreceived amount",
618
+ ],
619
+ fields=[
620
+ "po_number", "vendor", "order_date",
621
+ "expected_delivery", "open_amount",
622
+ ],
623
+ ),
624
+
625
+ # ── 26. OSHA Incident Logs ──────────────────────────────────────────
626
+ "osha_incidents": DocTypeDefinition(
627
+ id=26,
628
+ display_name="OSHA Incident Logs",
629
+ category="safety",
630
+ keywords=[
631
+ "OSHA 300", "LOG OF WORK-RELATED INJURIES",
632
+ "OSHA FORM 300A", "OSHA 301",
633
+ "INJURY AND ILLNESS INCIDENT REPORT",
634
+ "SUMMARY OF WORK-RELATED INJURIES AND ILLNESSES",
635
+ "case number", "osha case id", "case no",
636
+ "employee's name", "injured worker", "name",
637
+ "date of injury", "incident date", "date of onset",
638
+ "description of injury", "injury type",
639
+ "nature of injury", "classification",
640
+ "days away from work", "job transfer",
641
+ "other recordable", "death",
642
+ ],
643
+ fields=[
644
+ "case_number", "employee_name", "incident_date",
645
+ "description", "incident_type", "classification",
646
+ ],
647
+ ),
648
+
649
+ # ── 27. Environmental Health & Safety (EHS) Logs ────────────────────
650
+ "ehs_logs": DocTypeDefinition(
651
+ id=27,
652
+ display_name="Environmental Health & Safety (EHS) Logs",
653
+ category="safety",
654
+ keywords=[
655
+ "EHS INCIDENT LOG", "ENVIRONMENTAL HEALTH SAFETY",
656
+ "INCIDENT TRACKER", "SPILL REPORT", "NEAR MISS REPORT",
657
+ "ENVIRONMENTAL INCIDENT REPORT",
658
+ "SAFETY OBSERVATION",
659
+ "incident id", "report number", "incident number",
660
+ "type", "category", "incident category",
661
+ "Spill", "Release", "Near Miss",
662
+ "location", "site", "facility", "area",
663
+ "severity", "impact level", "severity rating",
664
+ ],
665
+ fields=[
666
+ "incident_id", "location", "severity",
667
+ "corrective_action",
668
+ ],
669
+ ),
670
+
671
+ # ── 28. Health & Safety Inspection Logs ─────────────────────────────
672
+ "safety_inspections": DocTypeDefinition(
673
+ id=28,
674
+ display_name="Health & Safety Inspection Logs",
675
+ category="safety",
676
+ keywords=[
677
+ "SAFETY INSPECTION", "SITE SAFETY AUDIT",
678
+ "WORKPLACE INSPECTION", "SAFETY CHECKLIST",
679
+ "JSA (Job Safety Analysis)", "JOB SAFETY ANALYSIS",
680
+ "INSPECTION REPORT", "HAZARD ASSESSMENT",
681
+ "inspector", "auditor", "checked by", "inspected by",
682
+ "inspection date", "audit date", "date of inspection",
683
+ "hazard", "finding", "deficiency",
684
+ "risk", "priority", "criticality",
685
+ "risk level", "risk rating",
686
+ "status", "open/closed", "compliance", "compliance status",
687
+ ],
688
+ fields=[
689
+ "inspector", "inspection_date", "hazard_identified",
690
+ "risk_level", "status",
691
+ ],
692
+ ),
693
+
694
+ # ── 29. Reserve Report (PV-10) ──────────────────────────────────────
695
+ "reserve_report": DocTypeDefinition(
696
+ id=29,
697
+ display_name="Reserve Report (PV-10)",
698
+ category="oil_gas",
699
+ keywords=[
700
+ "RESERVE REPORT", "PV-10",
701
+ "PETROLEUM ENGINEERING REPORT",
702
+ "SEC RESERVES", "STANDARDIZED MEASURE",
703
+ "proved reserves", "proved developed and undeveloped",
704
+ "1p reserves", "total proved",
705
+ "oil (mbbls)", "oil barrels", "crude oil volume",
706
+ "condensate", "plant products",
707
+ "natural gas", "gas (mmcf)", "natural gas volume",
708
+ "future net revenue", "future net cash flows",
709
+ "discounted net revenue @ 10%", "pv-10",
710
+ "present value at 10%", "undiscounted cash flow",
711
+ "discounted future net cash flows",
712
+ "standardized measure",
713
+ ],
714
+ fields=[
715
+ "proved_reserves", "oil_volume", "gas_volume",
716
+ "future_net_revenue", "pv10_value",
717
+ ],
718
+ ),
719
+
720
+ # ── 30. Joint Interest Billing (JIB) ────────────────────────────────
721
+ "joint_interest_billing": DocTypeDefinition(
722
+ id=30,
723
+ display_name="Joint Interest Billing (JIB)",
724
+ category="oil_gas",
725
+ keywords=[
726
+ "JOINT INTEREST BILLING", "JIB STATEMENT",
727
+ "JOINT ACCOUNT", "PARTNER BILLING",
728
+ "OPERATING STATEMENT JOINT VENTURE",
729
+ "OIL AND GAS JIB",
730
+ "working interest", "wi %", "decimal interest", "wi",
731
+ "afe", "authority for expenditure", "afe number",
732
+ "project id", "lease", "well name", "property", "field",
733
+ "gross amount", "100% share", "total costs",
734
+ "your share", "owner's share", "net amount", "amount due",
735
+ "total joint account",
736
+ ],
737
+ fields=[
738
+ "afe_number", "working_interest", "expense_category",
739
+ "total_billings", "owner_share",
740
+ ],
741
+ ),
742
+
743
+ # ── 31. LOE Statement ───────────────────────────────────────────────
744
+ "loe_statement": DocTypeDefinition(
745
+ id=31,
746
+ display_name="LOE Statement (Lease Operating Expenses)",
747
+ category="oil_gas",
748
+ keywords=[
749
+ "LEASE OPERATING EXPENSES", "LOE STATEMENT",
750
+ "DIRECT OPERATING EXPENSE", "PRODUCTION COSTS",
751
+ "WELL EXPENSE", "WELL OPERATING STATEMENT",
752
+ "pumper labor", "contract labor", "operating labor",
753
+ "supervision", "chemicals", "treating", "methanol",
754
+ "chemical treatment", "salt water disposal", "swd",
755
+ "water disposal", "hauling", "workover",
756
+ "repairs & maintenance", "r&m", "maintenance",
757
+ "production costs", "direct operating expense",
758
+ ],
759
+ fields=[
760
+ "lease_name", "labor_cost", "chemicals",
761
+ "water_disposal", "repairs",
762
+ ],
763
+ ),
764
+
765
+ # ── 32. FERC Form 1 ────────────────────────────────────────────────
766
+ "ferc_form_1": DocTypeDefinition(
767
+ id=32,
768
+ display_name="FERC Form 1",
769
+ category="utilities",
770
+ keywords=[
771
+ "FERC FORM 1", "ELECTRIC UTILITY ANNUAL REPORT",
772
+ "MAJOR ELECTRIC UTILITY",
773
+ "FEDERAL ENERGY REGULATORY COMMISSION",
774
+ "FERC ACCOUNTS",
775
+ "respondent_name", "exact legal name of respondent",
776
+ "reporting entity", "utility name",
777
+ "electric plant in service", "plant in service",
778
+ "utility plant", "electric operating revenues",
779
+ "total operating revenue", "sales of electricity",
780
+ "megawatt hours", "mwh sold", "megawatt-hours sold",
781
+ "energy sales (mwh)",
782
+ ],
783
+ fields=[
784
+ "respondent_name", "year_ended", "plant_in_service",
785
+ "operating_revenues", "mwh_sold",
786
+ ],
787
+ ),
788
+
789
+ # ── 33. ARO Schedule ────────────────────────────────────────────────
790
+ "aro_schedule": DocTypeDefinition(
791
+ id=33,
792
+ display_name="ARO Schedule (Asset Retirement Obligation)",
793
+ category="accounting",
794
+ keywords=[
795
+ "ASSET RETIREMENT OBLIGATION", "ARO LIABILITY",
796
+ "ACCRETION SCHEDULE", "DECOMMISSIONING PROVISION",
797
+ "RECLAMATION LIABILITY", "FAS 143", "ASC 410",
798
+ "beginning aro", "opening aro", "beginning aro liability",
799
+ "accretion", "accretion expense", "accretion of discount",
800
+ "interest cost",
801
+ "new wells", "additions", "new aros",
802
+ "liabilities incurred", "new obligations",
803
+ "settlements", "obligations settled",
804
+ "remediation payments",
805
+ "ending balance", "closing aro", "ending aro liability",
806
+ ],
807
+ fields=[
808
+ "beginning_liability", "accretion_expense",
809
+ "new_obligations", "settlements", "ending_liability",
810
+ ],
811
+ ),
812
+
813
+ # ── 34. IFTA Fuel Tax Report ────────────────────────────────────────
814
+ "ifta_fuel_tax": DocTypeDefinition(
815
+ id=34,
816
+ display_name="IFTA Fuel Tax Report",
817
+ category="transportation",
818
+ keywords=[
819
+ "IFTA REPORT", "FUEL TAX RETURN",
820
+ "INTERNATIONAL FUEL TAX AGREEMENT",
821
+ "IFTA-100", "IFTA RETURN", "QUARTERLY FUEL TAX",
822
+ "jurisdiction", "state", "province", "member jurisdiction",
823
+ "total miles", "taxable miles", "distance traveled",
824
+ "total kilometers", "taxable distance",
825
+ "jurisdictional miles",
826
+ "fuel purchased", "gallons pumped", "gallons consumed",
827
+ "tax paid gallons", "fuel consumed", "fuel usage",
828
+ "total fuel", "gallons used",
829
+ "mpg", "fleet average mpg", "kpl", "average fuel economy",
830
+ "tax due", "refund", "net amount", "tax payable/(refund)",
831
+ ],
832
+ fields=[
833
+ "jurisdiction", "total_miles", "tax_paid_gallons",
834
+ "mpg", "net_tax_due",
835
+ ],
836
+ ),
837
+
838
+ # ── 35. DOT Hours of Service Logs ───────────────────────────────────
839
+ "dot_hos": DocTypeDefinition(
840
+ id=35,
841
+ display_name="DOT Hours of Service Logs",
842
+ category="transportation",
843
+ keywords=[
844
+ "DRIVER'S DAILY LOG", "HOURS OF SERVICE",
845
+ "RECORD OF DUTY STATUS", "ELD REPORT", "RODS",
846
+ "HOS LOG", "FMCSA",
847
+ "driver id", "driver name", "license number", "employee id",
848
+ "driving", "drive time", "hours driving",
849
+ "on duty", "on duty not driving", "off duty",
850
+ "sleeper berth", "rest hours",
851
+ "line 1", "line 3", "line 4",
852
+ "11 hour rule", "14 hour rule",
853
+ "hours of service violations", "hos violations",
854
+ "11 hour rule exceeded", "14 hour rule exceeded",
855
+ ],
856
+ fields=[
857
+ "driver_id", "driving_hours", "on_duty_hours",
858
+ "off_duty_hours", "violations",
859
+ ],
860
+ ),
861
+
862
+ # ── 36. Fleet Utilization Report ────────────────────────────────────
863
+ "fleet_utilization": DocTypeDefinition(
864
+ id=36,
865
+ display_name="Fleet Utilization Report",
866
+ category="transportation",
867
+ keywords=[
868
+ "FLEET UTILIZATION", "ASSET UTILIZATION",
869
+ "FLEET PRODUCTIVITY", "ODOMETER REPORT",
870
+ "vehicle id", "truck id", "fleet number", "asset id",
871
+ "utilization %", "active days %", "vehicle utilization",
872
+ "miles driven", "mileage", "total miles", "distance",
873
+ "idle time", "idling hours", "engine idle time",
874
+ "truck uptime", "vehicle activity",
875
+ ],
876
+ fields=[
877
+ "vehicle_id", "utilization_percent", "miles_driven",
878
+ "idle_time", "fuel_consumed",
879
+ ],
880
+ ),
881
+
882
+ # ── 37. Lane/Route Profitability ────────────────────────────────────
883
+ "lane_profitability": DocTypeDefinition(
884
+ id=37,
885
+ display_name="Lane/Route Profitability",
886
+ category="transportation",
887
+ keywords=[
888
+ "LANE PROFITABILITY", "ROUTE MARGIN",
889
+ "NET REVENUE PER LANE", "LOAD PROFITABILITY",
890
+ "FREIGHT MARGIN", "TRIP P&L",
891
+ "lane", "lane id", "route", "shipping lane",
892
+ "origin-destination", "o/d pair",
893
+ "revenue per mile", "rpm", "rate per mile",
894
+ "revenue/mile",
895
+ "cost per mile", "cpm", "operating cost/mile",
896
+ "total cost per mile",
897
+ "deadhead miles", "empty miles", "non-revenue miles",
898
+ "lane profit", "contribution margin", "net margin per mile",
899
+ "freight margin", "load profitability",
900
+ ],
901
+ fields=[
902
+ "lane_id", "revenue_per_mile", "cost_per_mile",
903
+ "deadhead", "margin",
904
+ ],
905
+ ),
906
+
907
+ # ── 38. Fuel Surcharge Schedule ─────────────────────────────────────
908
+ "fuel_surcharge": DocTypeDefinition(
909
+ id=38,
910
+ display_name="Fuel Surcharge Schedule",
911
+ category="transportation",
912
+ keywords=[
913
+ "FUEL SURCHARGE SCHEDULE", "FSC MATRIX",
914
+ "FUEL ADJUSTMENT FACTOR", "SURCHARGE TABLE",
915
+ "DOE DIESEL AVERAGE",
916
+ "fuel price", "peg price", "fuel index price",
917
+ "fuel index source", "doe average",
918
+ "price per gallon",
919
+ "surcharge per mile", "fsc rate", "surcharge percentage",
920
+ "adjustment factor", "surcharge rate",
921
+ "effective date", "week of", "effective period",
922
+ ],
923
+ fields=[
924
+ "fuel_price_range", "surcharge_rate", "effective_date",
925
+ "index_used",
926
+ ],
927
+ ),
928
+
929
+ # ── 39. Tip Reporting (Form 8027) ───────────────────────────────────
930
+ "tip_reporting": DocTypeDefinition(
931
+ id=39,
932
+ display_name="Tip Reporting (Form 8027)",
933
+ category="hospitality",
934
+ keywords=[
935
+ "FORM 8027", "TIP INCOME REPORT",
936
+ "EMPLOYER'S ANNUAL INFORMATION RETURN OF TIP INCOME",
937
+ "REPORT OF TIP INCOME", "ALLOCATED TIPS",
938
+ "GROSS RECEIPTS FROM FOOD",
939
+ "GROSS RECEIPTS FROM FOOD OR BEVERAGES",
940
+ "name of establishment", "restaurant name", "business name",
941
+ "establishment_name",
942
+ "charged tips", "credit card tips", "tips on charge receipts",
943
+ "cash tips", "direct cash tips",
944
+ "allocated tips", "allocated tip income",
945
+ "gross receipts", "food and beverage sales",
946
+ "gross sales", "total sales", "total revenue",
947
+ "tip shortfall", "shortfall",
948
+ ],
949
+ fields=[
950
+ "establishment_name", "gross_receipts", "charged_tips",
951
+ "cash_tips", "allocated_tips",
952
+ ],
953
+ ),
954
+
955
+ # ── 40. Daily Revenue Report ────────────────────────────────────────
956
+ "daily_revenue": DocTypeDefinition(
957
+ id=40,
958
+ display_name="Daily Revenue Report",
959
+ category="hospitality",
960
+ keywords=[
961
+ "DAILY SALES REPORT", "DSR", "FLASH REPORT",
962
+ "DAILY REVENUE TRACKER", "NIGHT AUDIT",
963
+ "NIGHT AUDIT REPORT", "POS SUMMARY",
964
+ "DAILY HIGHLIGHTS",
965
+ "date", "business day", "sales date",
966
+ "gross sales", "net sales", "total revenue", "daily revenue",
967
+ "discounts", "comps", "voids",
968
+ "promotional allowances", "promos",
969
+ "cash", "credit", "amex", "visa", "mastercard",
970
+ "credit card sales", "cash sales", "tender summary",
971
+ "covers", "guest count", "transactions", "customer count",
972
+ "net revenue", "sales net of discounts",
973
+ ],
974
+ fields=[
975
+ "date", "gross_sales", "discounts", "net_sales",
976
+ "payment_methods", "covers",
977
+ ],
978
+ ),
979
+
980
+ # ── 41. Departmental P&L ────────────────────────────────────────────
981
+ "departmental_pl": DocTypeDefinition(
982
+ id=41,
983
+ display_name="Departmental P&L",
984
+ category="operations",
985
+ keywords=[
986
+ "DEPARTMENTAL INCOME STATEMENT", "PROFIT AND LOSS BY DEPARTMENT",
987
+ "SEGMENT REPORTING", "COST CENTER REPORT",
988
+ "CONTRIBUTION BY DEPARTMENT", "DIVISIONAL P&L",
989
+ "department", "division", "cost center", "business unit",
990
+ "segment", "department_name",
991
+ "revenue", "sales", "departmental sales", "segment revenue",
992
+ "intercompany revenue",
993
+ "direct expenses", "departmental expenses",
994
+ "controllable expenses",
995
+ "contribution margin", "departmental income",
996
+ "departmental contribution", "segment profit",
997
+ ],
998
+ fields=[
999
+ "department_name", "dept_revenue", "direct_expenses",
1000
+ "contribution_margin",
1001
+ ],
1002
+ ),
1003
+
1004
+ # ── 42. Balance Sheet ───────────────────────────────────────────────
1005
+ "balance_sheet": DocTypeDefinition(
1006
+ id=42,
1007
+ display_name="Balance Sheet",
1008
+ category="core_financial",
1009
+ keywords=[
1010
+ "CONSOLIDATED BALANCE SHEETS",
1011
+ "STATEMENT OF FINANCIAL POSITION",
1012
+ "ASSETS LIABILITIES AND EQUITY",
1013
+ "LIABILITIES AND STOCKHOLDERS' EQUITY",
1014
+ "cash", "accounts receivable", "inventory",
1015
+ "prepaid expenses", "property plant and equipment",
1016
+ "accumulated depreciation", "intangible assets",
1017
+ "accounts payable", "accrued liabilities",
1018
+ "short-term debt", "long-term debt", "deferred revenue",
1019
+ "total equity", "shareholders' equity",
1020
+ "retained earnings",
1021
+ "total assets", "total liabilities",
1022
+ ],
1023
+ fields=[
1024
+ "Assets", "Liabilities", "Equity",
1025
+ ],
1026
+ ),
1027
+
1028
+ # ── 43. Income Statement ────────────────────────────────────────────
1029
+ "income_statement": DocTypeDefinition(
1030
+ id=43,
1031
+ display_name="Income Statement",
1032
+ category="core_financial",
1033
+ keywords=[
1034
+ "CONSOLIDATED STATEMENTS OF OPERATIONS",
1035
+ "STATEMENTS OF INCOME", "STATEMENT OF EARNINGS",
1036
+ "PROFIT AND LOSS", "P&L",
1037
+ "revenue", "sales", "net sales", "total revenue",
1038
+ "cost of goods sold", "cogs", "cost of sales",
1039
+ "gross profit", "operating expenses",
1040
+ "marketing", "payroll", "rent", "depreciation",
1041
+ "amortization", "interest expense",
1042
+ "operating income", "net income", "net profit",
1043
+ "net earnings", "ebitda",
1044
+ ],
1045
+ fields=[
1046
+ "revenue", "cogs", "operating_expenses",
1047
+ "net_income",
1048
+ ],
1049
+ ),
1050
+
1051
+ # ── 44. Cash Flow Statement ─────────────────────────────────────────
1052
+ "cash_flow_statement": DocTypeDefinition(
1053
+ id=44,
1054
+ display_name="Cash Flow Statement",
1055
+ category="core_financial",
1056
+ keywords=[
1057
+ "CONSOLIDATED STATEMENTS OF CASH FLOWS",
1058
+ "STATEMENT OF CASH FLOWS", "CASH FLOW STATEMENT",
1059
+ "net cash provided by operating activities",
1060
+ "cash from operations", "operating activities",
1061
+ "cash used in investing activities", "investing activities",
1062
+ "cash from investing",
1063
+ "cash provided by financing", "financing activities",
1064
+ "cash from financing activities",
1065
+ "net change in cash", "net increase/(decrease)",
1066
+ "capital expenditures", "purchase of property and equipment",
1067
+ "capex",
1068
+ ],
1069
+ fields=[
1070
+ "operating_cash_flow", "investing_cash_flow",
1071
+ "financing_cash_flow", "capex",
1072
+ ],
1073
+ ),
1074
+
1075
+ # ── 45. Statement of Shareholders' Equity ──────────────────────────
1076
+ "shareholders_equity": DocTypeDefinition(
1077
+ id=45,
1078
+ display_name="Statement of Shareholders' Equity",
1079
+ category="core_financial",
1080
+ keywords=[
1081
+ "STATEMENT OF STOCKHOLDERS EQUITY",
1082
+ "CHANGES IN EQUITY", "RETAINED EARNINGS STATEMENT",
1083
+ "CONSOLIDATED STATEMENT OF CHANGES IN EQUITY",
1084
+ "beginning balance", "balance at beginning of period",
1085
+ "opening equity", "beginning equity",
1086
+ "net income", "comprehensive income", "net profit",
1087
+ "dividends", "distributions", "dividends declared",
1088
+ "issuance of common stock", "exercise of options",
1089
+ "exercise of stock options",
1090
+ "share-based compensation", "stock-based compensation",
1091
+ "repurchase", "treasury stock", "share buybacks",
1092
+ "repurchase of common stock",
1093
+ "ending balance", "balance at end of period", "closing equity",
1094
+ ],
1095
+ fields=[
1096
+ "beginning_equity", "net_income", "dividends",
1097
+ "stock_issuance", "stock_repurchase", "ending_equity",
1098
+ ],
1099
+ ),
1100
+
1101
+ # ── 46. Budget vs. Actuals (BvA) Report ─────────────────────────────
1102
+ "budget_vs_actuals": DocTypeDefinition(
1103
+ id=46,
1104
+ display_name="Budget vs. Actuals (BvA) Report",
1105
+ category="budgeting",
1106
+ keywords=[
1107
+ "BUDGET VS ACTUAL", "VARIANCE REPORT",
1108
+ "BVA", "FORECAST VS ACTUAL", "PLAN VS ACTUAL",
1109
+ "MANAGEMENT REPORT", "MONTHLY PERFORMANCE",
1110
+ "actual", "budget", "plan", "forecast", "target",
1111
+ "budgeted amount", "current period actual",
1112
+ "mtd actual", "ytd actual", "actuals",
1113
+ "variance $", "variance %", "var", "diff",
1114
+ "over/under", "over/(under)", "amount variance",
1115
+ "percentage variance", "% diff",
1116
+ "favorable", "unfavorable",
1117
+ ],
1118
+ fields=[
1119
+ "gl_account", "actual_amount", "budget_amount",
1120
+ "variance_amount", "variance_percent",
1121
+ ],
1122
+ ),
1123
+
1124
+ # ── 47. Aged Accounts Receivable (AR) Report ───────────────────────
1125
+ "aged_ar": DocTypeDefinition(
1126
+ id=47,
1127
+ display_name="Aged Accounts Receivable (AR) Report",
1128
+ category="credit",
1129
+ keywords=[
1130
+ "AGED RECEIVABLES", "AR AGING",
1131
+ "AGED TRIAL BALANCE RECEIVABLES",
1132
+ "OPEN INVOICE REPORT", "RECEIVABLES AGING SUMMARY",
1133
+ "customer", "client", "customer name", "client",
1134
+ "current", "not due", "<30", "<30 days",
1135
+ "0-30", "1-30 days", "30-60", "31-60",
1136
+ "60-90", "61-90", "90+", "over 90", ">90",
1137
+ "past due 90+", "91+ days",
1138
+ "total receivable", "total due", "balance",
1139
+ "amount due", "open invoice report",
1140
+ ],
1141
+ fields=[
1142
+ "customer_name", "current_bucket", "bucket_30_60",
1143
+ "bucket_60_90", "bucket_90_plus", "total_due",
1144
+ ],
1145
+ ),
1146
+
1147
+ # ── 48. Aged Accounts Payable (AP) Report ──────────────────────────
1148
+ "aged_ap": DocTypeDefinition(
1149
+ id=48,
1150
+ display_name="Aged Accounts Payable (AP) Report",
1151
+ category="credit",
1152
+ keywords=[
1153
+ "AGED PAYABLES", "AP AGING",
1154
+ "AGED TRIAL BALANCE PAYABLES", "VENDOR AGING",
1155
+ "PAYABLES AGING SUMMARY",
1156
+ "vendor", "supplier", "vendor name",
1157
+ "current", "not due", "<30 days",
1158
+ "0-30", "30-60", "31-60", "60-90", "61-90",
1159
+ "90+", "over 90", ">90", ">90 days",
1160
+ "total accounts payable", "balance",
1161
+ "amount payable", "total liability",
1162
+ ],
1163
+ fields=[
1164
+ "vendor_name", "current_bucket", "bucket_30_60",
1165
+ "bucket_60_90", "bucket_90_plus",
1166
+ ],
1167
+ ),
1168
+
1169
+ # ── 49. Headcount & Payroll Register ────────────────────────────────
1170
+ "headcount_payroll": DocTypeDefinition(
1171
+ id=49,
1172
+ display_name="Headcount & Payroll Register",
1173
+ category="hr",
1174
+ keywords=[
1175
+ "HEADCOUNT REPORT", "PAYROLL REGISTER",
1176
+ "CENSUS", "FTE REPORT", "EMPLOYEE LIST",
1177
+ "SALARY ROSTER", "EMPLOYEE CENSUS",
1178
+ "PAYROLL SUMMARY",
1179
+ "employee id", "eeid", "employee number", "personnel number",
1180
+ "file number",
1181
+ "department", "cost center", "division", "unit",
1182
+ "annual salary", "base salary", "base rate", "hourly rate",
1183
+ "gross pay", "gross wages", "total earnings", "total compensation",
1184
+ "net pay", "net amount", "take home", "take home pay",
1185
+ "check amount",
1186
+ "deductions", "taxes withheld", "401k",
1187
+ "benefit deductions", "withholdings",
1188
+ ],
1189
+ fields=[
1190
+ "employee_id", "department", "salary", "gross_pay",
1191
+ "net_pay", "deductions",
1192
+ ],
1193
+ ),
1194
+
1195
+ # ── 50. Debt Schedule & Covenant Compliance ────────────────────────
1196
+ "debt_covenant": DocTypeDefinition(
1197
+ id=50,
1198
+ display_name="Debt Schedule & Covenant Compliance",
1199
+ category="treasury",
1200
+ keywords=[
1201
+ "COMPLIANCE CERTIFICATE", "COVENANT CALCULATIONS",
1202
+ "DEBT SERVICE COVERAGE RATIO", "LEVERAGE RATIO",
1203
+ "BORROWING BASE CERTIFICATE", "FIXED CHARGE COVERAGE",
1204
+ "LOAN AGREEMENT COMPLIANCE", "DSCR",
1205
+ "covenant", "financial requirement",
1206
+ "debt service coverage ratio", "current ratio",
1207
+ "leverage ratio", "fixed charge coverage",
1208
+ "minimum", "maximum", "threshold", "required",
1209
+ "calculated", "actual", "current period",
1210
+ "pass/fail", "in compliance", "compliant", "met",
1211
+ "outstanding principal", "loan balance",
1212
+ "debt outstanding", "ending debt balance", "debt schedule",
1213
+ ],
1214
+ fields=[
1215
+ "covenant_name", "required_value", "actual_value",
1216
+ "compliance_status", "debt_balance",
1217
+ ],
1218
+ ),
1219
+
1220
+ # ── 51. Tax Provision Workpapers (ASC 740) ──────────────────────────
1221
+ "tax_provision": DocTypeDefinition(
1222
+ id=51,
1223
+ display_name="Tax Provision Workpapers (ASC 740)",
1224
+ category="tax",
1225
+ keywords=[
1226
+ "TAX PROVISION", "ASC 740", "INCOME TAX PROVISION",
1227
+ "DEFERRED TAX ASSET", "DEFERRED TAX LIABILITY",
1228
+ "EFFECTIVE TAX RATE", "FAS 109",
1229
+ "pre-tax income", "book income", "income before taxes",
1230
+ "permanent differences", "non-deductible",
1231
+ "non-deductible expenses", "meals and entertainment",
1232
+ "non-taxable income",
1233
+ "temporary differences", "timing differences",
1234
+ "depreciation adjustment", "accruals",
1235
+ "current expense", "current provision",
1236
+ "current tax expense", "current taxes payable",
1237
+ "deferred expense", "deferred provision",
1238
+ "deferred tax expense", "change in deferred taxes",
1239
+ ],
1240
+ fields=[
1241
+ "pre_tax_income", "permanent_diff", "temporary_diff",
1242
+ "current_tax_expense", "deferred_tax_expense",
1243
+ ],
1244
+ ),
1245
+
1246
+ # ── 52. Capital Expenditure (CapEx) Budget ──────────────────────────
1247
+ "capex_budget": DocTypeDefinition(
1248
+ id=52,
1249
+ display_name="Capital Expenditure (CapEx) Budget",
1250
+ category="budgeting",
1251
+ keywords=[
1252
+ "CAPEX BUDGET", "CAPITAL PLAN", "INVESTMENT BUDGET",
1253
+ "CAPITAL SPENDING PLAN", "CAPITAL FORECAST",
1254
+ "PROJECT BUDGET", "LONG-RANGE CAPITAL PLAN",
1255
+ "project", "project name", "initiative", "item description",
1256
+ "investment", "capex category", "asset type", "category",
1257
+ "budget", "budgeted amount", "approved amount",
1258
+ "authorized amount", "total cost",
1259
+ "spend to date", "incurred", "incurred to date",
1260
+ "cumulative spend", "actuals",
1261
+ "remaining budget", "remaining", "remaining funds",
1262
+ "available", "balance",
1263
+ ],
1264
+ fields=[
1265
+ "project_name", "budget_amount", "spend_to_date",
1266
+ "remaining_budget", "asset_category",
1267
+ ],
1268
+ ),
1269
+
1270
+ # ── 53. Insurance Policy Declarations ───────────────────────────────
1271
+ "insurance_declarations": DocTypeDefinition(
1272
+ id=53,
1273
+ display_name="Insurance Policy Declarations",
1274
+ category="risk",
1275
+ keywords=[
1276
+ "DECLARATIONS PAGE", "CERTIFICATE OF INSURANCE",
1277
+ "POLICY DECLARATIONS", "POLICY NUMBER",
1278
+ "SCHEDULE OF COVERAGE", "SCHEDULE OF COVERAGES AND LIMITS",
1279
+ "INSURANCE POLICY", "LIMITS OF LIABILITY",
1280
+ "policy number", "policy no", "policy id",
1281
+ "named insured", "insured", "policyholder",
1282
+ "policy period", "effective dates",
1283
+ "policy effective date", "policy expiration date",
1284
+ "premium", "total premium", "annual premium",
1285
+ "policy cost", "cost",
1286
+ "coverage", "coverage part", "type of insurance",
1287
+ "General Liability", "Auto Liability",
1288
+ "Workers Compensation", "Property",
1289
+ "occurrence limit", "aggregate limit",
1290
+ "each occurrence limit", "policy limit", "coverage limit",
1291
+ "limit of liability", "limits of liability",
1292
+ ],
1293
+ fields=[
1294
+ "policy_number", "insured_name", "policy_period",
1295
+ "premium", "coverage_type", "limit_amount",
1296
+ ],
1297
+ ),
1298
+ }
1299
+
1300
+
1301
+ # ============================================================================
1302
+ # CATEGORY DISPLAY NAMES (for frontend grouping)
1303
+ # ============================================================================
1304
+
1305
# Maps internal category keys (DocTypeDefinition.category) to the
# human-readable group labels used by the frontend for grouping doc types.
# Keys not present here fall back to the raw category key (see
# get_all_doc_types_summary).
CATEGORY_NAMES: Dict[str, str] = {
    "compliance": "Compliance & Audit",
    "saas_metrics": "SaaS / Subscription Metrics",
    "revenue_recognition": "Revenue Recognition",
    "sales": "Sales & Bookings",
    "tax": "Tax & Filings",
    "inventory": "Inventory Management",
    "retail": "Retail & Merchandising",
    "treasury": "Treasury & Cash Management",
    "real_estate": "Real Estate",
    "hospitality": "Hospitality & F&B",
    "accounting": "Accounting & Leases",
    "manufacturing": "Manufacturing & Production",
    "procurement": "Procurement & Purchasing",
    "safety": "Health, Safety & Environmental",
    "oil_gas": "Oil, Gas & Energy",
    "utilities": "Utilities & Regulated",
    "transportation": "Transportation & Fleet",
    "core_financial": "Core Financial Statements",
    "budgeting": "Budgeting & Planning",
    "credit": "Credit & Collections",
    "hr": "HR & Payroll",
    "operations": "Operations & Segments",
    "risk": "Risk & Insurance",
    "general": "General",
}
1331
+
1332
+
1333
+ # ============================================================================
1334
+ # LEARNED KEYWORDS LOADER
1335
+ # ============================================================================
1336
+
1337
# Path to the admin-approved learned-keywords store, kept next to this module.
_LEARNED_KEYWORDS_PATH = os.path.join(
    os.path.dirname(__file__), "learned_keywords.json"
)


def load_learned_keywords() -> Dict[str, List[str]]:
    """Load admin-approved learned keywords from the JSON sidecar file.

    Returns:
        Mapping of doc-type key -> list of learned keyword strings.
        An empty dict when the file is missing, corrupt, or unreadable
        (failures are logged, never raised — classification must not
        break because of a bad sidecar file).
    """
    if not os.path.exists(_LEARNED_KEYWORDS_PATH):
        return {}
    try:
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(_LEARNED_KEYWORDS_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError) as e:
        logger.warning(f"Failed to load learned keywords: {e}")
        return {}
1352
+
1353
+
1354
def save_learned_keywords(learned: Dict[str, List[str]]) -> None:
    """Persist learned keywords to the JSON sidecar file.

    Args:
        learned: Mapping of doc-type key -> list of learned keyword strings.

    Write failures are logged rather than raised (best-effort persistence,
    mirroring load_learned_keywords).
    """
    try:
        # utf-8 + ensure_ascii=False so non-ASCII keywords round-trip readably.
        with open(_LEARNED_KEYWORDS_PATH, "w", encoding="utf-8") as f:
            json.dump(learned, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved learned keywords to {_LEARNED_KEYWORDS_PATH}")
    except OSError as e:
        logger.error(f"Failed to save learned keywords: {e}")
1362
+
1363
+
1364
def get_effective_keywords(doc_type_key: str) -> List[str]:
    """
    Get the full keyword list for a doc type, merging base + learned.

    Returns:
        Combined list of base keywords + any learned keywords for this type.
        Empty list when the key is not in the registry.
    """
    definition = DOC_TYPE_REGISTRY.get(doc_type_key)
    if definition is None:
        return []

    # Base keywords first, then learned additions, de-duplicated while
    # preserving insertion order.
    merged = list(definition.keywords)
    seen = set(merged)
    for extra in load_learned_keywords().get(doc_type_key, []):
        if extra not in seen:
            merged.append(extra)
            seen.add(extra)

    return merged
1385
+
1386
+
1387
def get_all_doc_types_summary() -> List[Dict]:
    """
    Get a summary list of all doc types for the frontend.

    Returns:
        List of dicts (sorted by numeric id) with id, key, display_name,
        category, category_name, keyword_count, field_count and
        learned_keyword_count.
    """
    learned = load_learned_keywords()
    summaries = [
        {
            "id": definition.id,
            "key": key,
            "display_name": definition.display_name,
            "category": definition.category,
            "category_name": CATEGORY_NAMES.get(
                definition.category, definition.category
            ),
            # keyword_count includes admin-approved learned keywords.
            "keyword_count": len(definition.keywords) + len(learned.get(key, [])),
            "field_count": len(definition.fields),
            "learned_keyword_count": len(learned.get(key, [])),
        }
        for key, definition in DOC_TYPE_REGISTRY.items()
    ]
    summaries.sort(key=lambda entry: entry["id"])
    return summaries
app/services/ingestion/dolphin/__init__.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dolphin PDF Extraction Module — Hybrid Architecture.
3
+
4
+ Uses ByteDance Dolphin-v2 for advanced document layout analysis,
5
+ classification, and element extraction, combined with pdfplumber
6
+ for gap-filling and validation.
7
+
8
+ ## Quick Check
9
+
10
+ ```python
11
+ from app.services.ingestion.dolphin import is_dolphin_available, ensure_model_downloaded
12
+
13
+ if is_dolphin_available():
14
+ from app.services.ingestion.dolphin.client import DolphinClient
15
+ client = DolphinClient()
16
+ ```
17
+ """
18
+
19
+ import os
20
+ import logging
21
+ from typing import Optional
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
# Default model storage location (relative to backend root):
# <repo>/models/dolphin-v2, resolved from this package's directory.
DEFAULT_MODEL_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "..", "..", "..", "..", "models", "dolphin-v2"
)

# Cached result of is_dolphin_available(); None means "not probed yet".
_dolphin_available: Optional[bool] = None
32
+
33
+
34
+ def _detect_device() -> str:
35
+ """Auto-detect best available compute device: cuda > mps > cpu."""
36
+ try:
37
+ import torch
38
+ if torch.cuda.is_available():
39
+ logger.info("Dolphin device: CUDA GPU detected")
40
+ return "cuda"
41
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
42
+ logger.info("Dolphin device: Apple MPS (Metal) detected")
43
+ return "mps"
44
+ except ImportError:
45
+ pass
46
+ logger.info("Dolphin device: CPU mode")
47
+ return "cpu"
48
+
49
+
50
def _get_model_path() -> str:
    """Resolve the Dolphin model directory: configured path, else default."""
    configured = None
    try:
        from app.core.config import settings

        configured = settings.DOLPHIN_MODEL_PATH
    except Exception:
        # Config unavailable (e.g. standalone use) — fall back to default.
        pass
    return configured if configured else os.path.abspath(DEFAULT_MODEL_DIR)
59
+
60
+
61
def is_dolphin_available() -> bool:
    """
    Check if Dolphin model and dependencies are installed.

    Returns True when either a remote Dolphin API is configured, or the
    local dependencies (torch, transformers, PIL) are importable AND the
    model directory contains a config plus weights.
    Result is cached after first check (see _dolphin_available).
    """
    global _dolphin_available
    if _dolphin_available is not None:
        return _dolphin_available

    # If remote API is configured, we consider Dolphin available
    # (the remote worker manages the model). Guard the import so a
    # missing/broken config module degrades to local detection —
    # consistent with _get_model_path / get_device.
    try:
        from app.core.config import settings

        if settings.DOLPHIN_API_URL:
            _dolphin_available = True
            return True
    except Exception:
        pass

    try:
        import torch  # noqa: F401
        import transformers  # noqa: F401
        from PIL import Image  # noqa: F401

        model_path = _get_model_path()
        if os.path.isdir(model_path):
            # Require a config plus at least one weights file:
            # single-file safetensors, legacy .bin, or sharded safetensors
            # ("model-00001-of-000NN.safetensors").
            has_config = os.path.exists(os.path.join(model_path, "config.json"))
            has_weights = (
                os.path.exists(os.path.join(model_path, "model.safetensors"))
                or os.path.exists(os.path.join(model_path, "pytorch_model.bin"))
                or any(
                    f.startswith("model-")
                    for f in os.listdir(model_path)
                    if f.endswith(".safetensors")
                )
            )
            _dolphin_available = has_config and has_weights
        else:
            _dolphin_available = False

    except ImportError as e:
        logger.debug(f"Dolphin dependencies not installed: {e}")
        _dolphin_available = False

    logger.info(f"Dolphin availability: {_dolphin_available}")
    return _dolphin_available
101
+
102
+
103
def ensure_model_downloaded(force: bool = False) -> str:
    """
    Download Dolphin-v2 model from HuggingFace if not already present.

    Args:
        force: If True, re-download even if model exists

    Returns:
        Path to the downloaded model directory

    Raises:
        RuntimeError: if the snapshot download fails (chained to the cause).
    """
    model_path = _get_model_path()

    # Presence of config.json is treated as "model already downloaded".
    if not force and os.path.isdir(model_path):
        config_path = os.path.join(model_path, "config.json")
        if os.path.exists(config_path):
            logger.info(f"Dolphin model already present at {model_path}")
            return model_path

    logger.info("Downloading Dolphin-v2 model from HuggingFace...")

    try:
        from huggingface_hub import snapshot_download

        os.makedirs(model_path, exist_ok=True)
        snapshot_download(
            repo_id="ByteDance/Dolphin-v2",
            local_dir=model_path,
            local_dir_use_symlinks=False,
        )
        logger.info(f"Dolphin-v2 model downloaded to {model_path}")

        # Invalidate cache so next check picks up the new model
        global _dolphin_available
        _dolphin_available = None

        return model_path

    except Exception as e:
        logger.error(f"Failed to download Dolphin model: {e}")
        # BUG FIX: the final message segment was a plain string, so users saw
        # the literal text "{model_path}" — it must be an f-string.
        raise RuntimeError(
            f"Dolphin model download failed: {e}. "
            "Install huggingface-hub and ensure network access, "
            f"or manually download to: {model_path}"
        ) from e
147
+
148
+
149
def get_device() -> str:
    """Get configured or auto-detected device."""
    configured = "auto"
    try:
        from app.core.config import settings

        configured = getattr(settings, "DOLPHIN_DEVICE", "auto")
    except Exception:
        # Config unavailable — fall through to auto-detection.
        pass
    return configured if configured != "auto" else _detect_device()
app/services/ingestion/dolphin/classifier.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document Classifier — 53-Type Keyword Classification System
3
+ ============================================================
4
+ Identifies financial document types from parsed content using keyword
5
+ matching against the doc_keywords registry. Uses an 80% keyword match
6
+ threshold for high-confidence classification with a 3-tier fallback.
7
+
8
+ Tiers:
9
+ ≥80% → High confidence (classified)
10
+ 50-79% → Low confidence (classified, flagged needs_review)
11
+ <50% → No match (general_financial fallback)
12
+ """
13
+
14
+ import re
15
+ import logging
16
+ from typing import List, Dict, Tuple, Optional
17
+ from dataclasses import dataclass, field
18
+
19
+ from ..doc_keywords import (
20
+ DOC_TYPE_REGISTRY,
21
+ get_effective_keywords,
22
+ get_all_doc_types_summary,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
# ============================================================================
# CLASSIFICATION THRESHOLDS
# ============================================================================
# Fraction of a doc type's keywords that must be found in the text.
# Compared against the per-type match percentage (threshold * 100).
HIGH_CONFIDENCE_THRESHOLD = 0.80  # ≥80% → classified
LOW_CONFIDENCE_THRESHOLD = 0.50  # 50-79% → flagged needs_review
33
+
34
+
35
@dataclass
class DocumentClassification:
    """Classification result for a parsed document.

    Produced by DocumentClassifier.classify(); carries the winning doc type,
    the keyword-match evidence behind it, and review/fallback flags.
    """
    doc_type: str  # Internal key (e.g. "rent_roll")
    doc_type_display: str = ""  # Human name (e.g. "Rent Roll")
    confidence: float = 0.0  # 0.0 - 1.0
    match_percentage: float = 0.0  # % of keywords matched (0-100)
    needs_review: bool = False  # True if 50-79% match
    matched_keywords: List[str] = field(default_factory=list)  # Keywords hit in text
    extractable_fields: List[str] = field(default_factory=list)  # Fields from registry
    detected_sections: List[str] = field(default_factory=list)  # Layout sections (Dolphin)
    extraction_method: str = "dolphin_hybrid"  # How the text was extracted
    secondary_types: List[str] = field(default_factory=list)  # Other plausible types
    category: str = "general"  # Registry category key
+
50
+
51
class DocumentClassifier:
    """
    Classifies financial documents using the 53-type keyword registry.

    Scans the combined Dolphin + pdfplumber text against every registered
    doc type, counts keyword hits, and applies a 3-tier threshold:

      * >= HIGH_CONFIDENCE_THRESHOLD (80%)  -> classified outright
      * >= LOW_CONFIDENCE_THRESHOLD (50%)   -> classified, flagged needs_review
      * below 50%                           -> fallback to "general_financial"

    Usage:
        classifier = DocumentClassifier()
        result = classifier.classify(combined_text)
    """

    @staticmethod
    def classify(
        text_content: str,
        dolphin_sections: Optional[List[Dict]] = None,
        dolphin_elements: Optional[list] = None,
    ) -> "DocumentClassification":
        """
        Classify the document based on combined extracted text.

        Args:
            text_content: Full text extracted from BOTH engines
                (Dolphin + pdfplumber).
            dolphin_sections: Layout sections from Dolphin (if available);
                their "type" labels are surfaced as ``detected_sections``.
            dolphin_elements: Parsed elements from Dolphin (currently unused,
                accepted for forward compatibility).

        Returns:
            DocumentClassification with type, confidence, matched keywords,
            and extractable fields.
        """
        if not text_content:
            # Nothing to score — neutral zero-confidence fallback.
            return DocumentClassification(
                doc_type="general_financial",
                doc_type_display="General Financial",
                confidence=0.0,
                extraction_method="dolphin_hybrid",
            )

        text_lower = text_content.lower()

        # ── Score every registered doc type ──────────────────────────
        scores: Dict[str, Dict] = {}

        for doc_key, doc_def in DOC_TYPE_REGISTRY.items():
            # Effective keywords = base registry keywords + learned ones.
            all_keywords = get_effective_keywords(doc_key)
            if not all_keywords:
                continue

            matched = [kw for kw in all_keywords if kw.lower() in text_lower]
            total = len(all_keywords)
            pct = (len(matched) / total * 100) if total > 0 else 0.0

            scores[doc_key] = {
                "matched": matched,
                "total": total,
                "percentage": pct,
                "fields": doc_def.fields,
                "display_name": doc_def.display_name,
                "category": doc_def.category,
            }

        # ── Find top match ───────────────────────────────────────────
        if not scores:
            # Registry empty or no keywords anywhere — minimal confidence.
            return DocumentClassification(
                doc_type="general_financial",
                doc_type_display="General Financial",
                confidence=0.1,
                extraction_method="dolphin_hybrid",
            )

        best_key = max(scores, key=lambda k: scores[k]["percentage"])
        best = scores[best_key]
        best_pct = best["percentage"]

        # ── Apply 3-tier threshold ───────────────────────────────────
        if best_pct >= HIGH_CONFIDENCE_THRESHOLD * 100:
            # Tier 1: high confidence — accept outright.
            confidence = min(best_pct / 100.0, 1.0)
            needs_review = False
        elif best_pct >= LOW_CONFIDENCE_THRESHOLD * 100:
            # Tier 2: low confidence — classify but flag for human review.
            confidence = best_pct / 100.0
            needs_review = True
        else:
            # Tier 3: no usable match — log and fall back immediately so the
            # tier-1/2 result construction below never sees a weak match.
            logger.info(
                f"No doc type matched at ≥50%. Best: {best_key} "
                f"({best_pct:.1f}%). Falling back to general_financial."
            )
            return DocumentClassification(
                doc_type="general_financial",
                doc_type_display="General Financial",
                confidence=round(max(best_pct / 100.0, 0.1), 3),
                match_percentage=round(best_pct, 1),
                needs_review=False,
                matched_keywords=[],
                extractable_fields=[],
                extraction_method="dolphin_hybrid",
                secondary_types=[],
                category="general",
            )

        doc_type = best_key

        # ── Gather secondary types (other types with decent matches) ─
        secondary = [
            k for k, v in scores.items()
            if v["percentage"] >= 30.0 and k != doc_type
        ]
        # Sort secondaries by match percentage descending
        secondary.sort(key=lambda k: scores[k]["percentage"], reverse=True)

        # Surface Dolphin layout section labels when the caller provided them
        # (previously this parameter was accepted but never used).
        detected_sections = [
            str(s.get("type", ""))
            for s in (dolphin_sections or [])
            if isinstance(s, dict) and s.get("type")
        ]

        logger.info(
            f"Classified as '{doc_type}' ({best['display_name']}) "
            f"with {best_pct:.1f}% keyword match "
            f"({len(best['matched'])}/{best['total']} keywords). "
            f"needs_review={needs_review}"
        )

        return DocumentClassification(
            doc_type=doc_type,
            doc_type_display=best["display_name"],
            confidence=round(confidence, 3),
            match_percentage=round(best_pct, 1),
            needs_review=needs_review,
            matched_keywords=best["matched"],
            extractable_fields=best["fields"],
            detected_sections=detected_sections,
            extraction_method="dolphin_hybrid",
            secondary_types=secondary[:5],  # Top 5 secondary matches
            category=best["category"],
        )

    @staticmethod
    def get_financial_statement_types(classification: "DocumentClassification") -> List[str]:
        """
        Return the list of financial statement types that should be
        extracted from this document.

        For core financial statements, returns the matching type.
        For 10-K/10-Q and general_financial, returns all three.
        For specialized doc types, returns relevant statement types
        plus any secondaries detected.
        """
        # Comprehensive types always extract all three
        comprehensive_types = {"10-K", "10-Q", "general_financial"}

        if classification.doc_type in comprehensive_types:
            return ["income", "balance", "cash_flow"]

        # Core financial statement type mappings
        type_map = {
            "income_statement": ["income"],
            "balance_sheet": ["balance"],
            "cash_flow_statement": ["cash_flow"],
            "bank_statement": ["cash_flow"],
            "invoice": ["income"],
            "tax_return": ["income"],
            # Specialized types that primarily contain income-like data
            "arr_mrr_waterfall": ["income"],
            "deferred_revenue_schedule": ["income", "balance"],
            "cac_ltv_model": ["income"],
            "noi_statement": ["income"],
            "cogm": ["income"],
            "production_variance": ["income"],
            "departmental_pl": ["income"],
            "daily_revenue": ["income"],
            "budget_vs_actuals": ["income"],
            # Balance-sheet focused
            "rent_roll": ["income", "balance"],
            "asc_842_lease": ["balance"],
            "fixed_asset_rollforward": ["balance"],
            "wip_valuation": ["balance"],
            "aged_ar": ["balance"],
            "aged_ap": ["balance"],
            "debt_covenant": ["balance"],
            "aro_schedule": ["balance"],
            "reserve_report": ["balance"],
            # Cash flow focused
            "thirteen_week_cash_flow": ["cash_flow"],
            "capex_reserve": ["cash_flow", "balance"],
            "capex_budget": ["cash_flow"],
            # Equity
            "shareholders_equity": ["balance"],
        }

        # Copy so appending secondaries below never mutates the map's lists.
        base = list(type_map.get(
            classification.doc_type,
            ["income", "balance", "cash_flow"],  # Default: extract all
        ))

        # Add statement types implied by secondary classifications
        for sec_type in classification.secondary_types:
            for extra in type_map.get(sec_type, []):
                if extra not in base:
                    base.append(extra)

        return base

    @staticmethod
    def classify_with_details(text_content: str) -> Dict:
        """
        Classify and return a full details dict for API responses.

        Returns a JSON-serializable dict with all classification details.
        """
        result = DocumentClassifier.classify(text_content)
        return {
            "doc_type": result.doc_type,
            "doc_type_display": result.doc_type_display,
            "confidence": result.confidence,
            "match_percentage": result.match_percentage,
            "needs_review": result.needs_review,
            "matched_keywords": result.matched_keywords,
            "extractable_fields": result.extractable_fields,
            "secondary_types": result.secondary_types,
            "category": result.category,
        }
app/services/ingestion/dolphin/client.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dolphin Client — Wraps the ByteDance Dolphin-v2 model for document parsing.
3
+
4
+ Provides page-level, element-level, and layout parsing capabilities
5
+ with automatic device selection (CUDA > MPS > CPU).
6
+ """
7
+
8
+ import os
9
+ import logging
10
+ from typing import List, Dict, Any, Optional
11
+ from dataclasses import dataclass, field
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Data classes for Dolphin outputs
18
+ # ---------------------------------------------------------------------------
19
+
20
@dataclass
class DolphinElement:
    """A single parsed element from a document page."""
    element_type: str  # One of: "text", "table", "formula", "figure", "code"
    content: str  # Markdown or plain text content
    bbox: Optional[List[float]] = None  # [x1, y1, x2, y2] bounding box, if known
    confidence: float = 1.0  # Parser confidence, 0.0 - 1.0
    page_number: int = 0  # 0-based page index this element came from
    metadata: Dict[str, Any] = field(default_factory=dict)  # Extra parser-specific info
29
+
30
+
31
@dataclass
class DolphinPageResult:
    """Result from page-level parsing of a single page."""
    page_number: int  # 0-based page index
    markdown: str  # Full page rendered as Markdown
    structured_json: Dict[str, Any] = field(default_factory=dict)  # Raw structured output from the model
    elements: List[DolphinElement] = field(default_factory=list)  # Parsed elements in page order
38
+
39
+
40
@dataclass
class DolphinLayoutResult:
    """Result from layout analysis of a single page."""
    page_number: int  # 0-based page index
    sections: List[Dict[str, Any]] = field(default_factory=list)  # Section descriptors: [{type, bbox, label}]
    reading_order: List[int] = field(default_factory=list)  # Element indices in reading order
    doc_type_hint: str = "unknown"  # "digital" or "photographed"
47
+
48
+
49
@dataclass
class DolphinDocumentResult:
    """Aggregated result for an entire PDF document."""
    pages: List[DolphinPageResult] = field(default_factory=list)  # One entry per parsed page
    layouts: List[DolphinLayoutResult] = field(default_factory=list)  # One layout result per page
    full_markdown: str = ""  # All page markdown joined with "---" separators
    total_pages: int = 0  # Number of page images processed
56
+
57
+
58
class DolphinClient:
    """
    High-level client for Dolphin-v2 document parsing.

    Acts as a factory: ``create()`` returns either this local model wrapper
    (when no API URL is configured) or a remote client. The model and
    processor are lazy-loaded on first use so construction stays cheap.
    """

    @staticmethod
    def create():
        """
        Factory method to create the appropriate Dolphin client.

        Returns:
            RemoteDolphinClient if DOLPHIN_API_URL is set,
            a local DolphinClient otherwise.
        """
        from app.core.config import settings

        if settings.DOLPHIN_API_URL:
            from app.services.ingestion.dolphin.remote_client import RemoteDolphinClient
            return RemoteDolphinClient()

        return DolphinClient()

    def __init__(
        self,
        model_path: Optional[str] = None,
        device: Optional[str] = None,
        max_batch_size: int = 4,
    ):
        """
        Args:
            model_path: Override for the model location; defaults to the
                package-configured path.
            device: Override for the torch device; defaults to auto-selection
                (CUDA > MPS > CPU).
            max_batch_size: Batch-size hint; stored but not yet used here.
        """
        from app.services.ingestion.dolphin import _get_model_path, get_device

        self.model_path = model_path or _get_model_path()
        self.device = device or get_device()
        self.max_batch_size = max_batch_size
        self._model = None  # Lazily populated by _ensure_loaded()
        self._processor = None  # Lazily populated by _ensure_loaded()

        logger.info(
            f"DolphinClient initialized: model={self.model_path}, device={self.device}"
        )

    # ------------------------------------------------------------------
    # Lazy model loading
    # ------------------------------------------------------------------

    def _ensure_loaded(self):
        """Lazy-load model and processor on first use.

        Raises:
            RuntimeError: if the model or processor cannot be loaded.
        """
        if self._model is not None:
            return

        try:
            import torch
            from transformers import AutoModelForVision2Seq, AutoProcessor

            logger.info(f"Loading Dolphin-v2 model from {self.model_path}...")

            self._processor = AutoProcessor.from_pretrained(
                self.model_path, trust_remote_code=True
            )
            self._model = AutoModelForVision2Seq.from_pretrained(
                self.model_path,
                trust_remote_code=True,
                # CRITICAL: CPU does not support float16 — force float32 on CPU
                torch_dtype=torch.float32 if self.device == "cpu" else torch.float16,
            )
            self._model.to(self.device)
            self._model.eval()

            logger.info("Dolphin-v2 model loaded successfully")

        except Exception as e:
            logger.error(f"Failed to load Dolphin model: {e}")
            raise RuntimeError(f"Dolphin model loading failed: {e}") from e

    # ------------------------------------------------------------------
    # PDF → Images conversion
    # ------------------------------------------------------------------

    @staticmethod
    def _pdf_to_images(pdf_path: str) -> list:
        """Convert PDF pages to PIL Images for Dolphin processing.

        Prefers pdf2image (poppler); falls back to a pypdf-based
        extractor when pdf2image is not installed.
        """
        try:
            from pdf2image import convert_from_path
            return convert_from_path(pdf_path, dpi=200)
        except ImportError:
            logger.warning("pdf2image not installed, using fallback renderer")
            return DolphinClient._pdf_to_images_fallback(pdf_path)

    @staticmethod
    def _pdf_to_images_fallback(pdf_path: str) -> list:
        """Fallback PDF → image conversion using pypdf.

        Guarantees exactly one image per PDF page so downstream page
        numbering stays aligned: a page's first decodable embedded image
        is used when present, otherwise a blank white placeholder is
        substituted for that page. (Previously placeholders were only
        added when *no* page yielded an image, which could desynchronize
        page indices for mixed documents.)
        """
        from PIL import Image
        import io

        try:
            from pypdf import PdfReader
            reader = PdfReader(pdf_path)
            images = []
            placeholder_count = 0

            for page in reader.pages:
                page_image = None
                for embedded in page.images:
                    try:
                        page_image = Image.open(io.BytesIO(embedded.data))
                        break  # One image per page is enough
                    except Exception:
                        # Corrupt/unsupported embedded image — try the next one
                        continue
                if page_image is None:
                    # Blank placeholder keeps page count and indices consistent
                    page_image = Image.new("RGB", (1700, 2200), "white")
                    placeholder_count += 1
                images.append(page_image)

            if placeholder_count:
                logger.warning(
                    f"{placeholder_count} page(s) had no extractable image; "
                    "layout analysis may be limited"
                )
            return images
        except Exception as e:
            logger.error(f"Fallback PDF image conversion failed: {e}")
            return []

    # ------------------------------------------------------------------
    # Core parsing methods
    # ------------------------------------------------------------------

    def parse_page(self, image, page_number: int = 0) -> "DolphinPageResult":
        """
        Parse a single page image into structured output.

        Args:
            image: PIL Image of the page
            page_number: Page index (0-based)

        Returns:
            DolphinPageResult with markdown and structured elements
            (empty result on parse failure — never raises).
        """
        self._ensure_loaded()

        try:
            import torch

            # Prepare input with the page-level task prompt
            prompt = "<page_parsing>"
            inputs = self._processor(
                images=image, text=prompt, return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self._model.generate(
                    **inputs,
                    max_new_tokens=4096,
                    do_sample=False,  # deterministic decoding
                )

            result_text = self._processor.batch_decode(
                outputs, skip_special_tokens=True
            )[0]

            # Split raw output into text/table elements
            elements = self._parse_elements_from_text(result_text, page_number)

            return DolphinPageResult(
                page_number=page_number,
                markdown=result_text,
                structured_json={"raw_output": result_text},
                elements=elements,
            )

        except Exception as e:
            logger.error(f"Dolphin page parsing failed for page {page_number}: {e}")
            return DolphinPageResult(
                page_number=page_number,
                markdown="",
                elements=[],
            )

    def parse_layout(self, image, page_number: int = 0) -> "DolphinLayoutResult":
        """
        Analyze layout/structure of a page image.

        Returns section bounding boxes, reading order, and document type hint.
        Returns an empty layout result on failure — never raises.
        """
        self._ensure_loaded()

        try:
            import torch

            prompt = "<layout_parsing>"
            inputs = self._processor(
                images=image, text=prompt, return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self._model.generate(
                    **inputs,
                    max_new_tokens=2048,
                    do_sample=False,
                )

            result_text = self._processor.batch_decode(
                outputs, skip_special_tokens=True
            )[0]

            sections = self._parse_layout_sections(result_text)
            doc_type_hint = "digital"  # Dolphin detects this in stage 1

            return DolphinLayoutResult(
                page_number=page_number,
                sections=sections,
                reading_order=list(range(len(sections))),
                doc_type_hint=doc_type_hint,
            )

        except Exception as e:
            logger.error(f"Dolphin layout parsing failed for page {page_number}: {e}")
            return DolphinLayoutResult(page_number=page_number)

    def parse_document(self, pdf_path: str) -> "DolphinDocumentResult":
        """
        Parse an entire PDF document — page-level + layout for all pages.

        This is the main entry point for the hybrid parser.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            DolphinDocumentResult with all pages parsed (total_pages=0 when
            no page images could be extracted).
        """
        images = self._pdf_to_images(pdf_path)
        if not images:
            logger.warning(f"No page images extracted from {pdf_path}")
            return DolphinDocumentResult(total_pages=0)

        pages = []
        layouts = []
        all_markdown = []

        for i, image in enumerate(images):
            logger.debug(f"Parsing page {i + 1}/{len(images)}")

            # Page-level parsing (structured content)
            page_result = self.parse_page(image, page_number=i)
            pages.append(page_result)
            all_markdown.append(page_result.markdown)

            # Layout analysis (structure detection)
            layouts.append(self.parse_layout(image, page_number=i))

        return DolphinDocumentResult(
            pages=pages,
            layouts=layouts,
            full_markdown="\n\n---\n\n".join(all_markdown),
            total_pages=len(images),
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_elements_from_text(text: str, page_number: int) -> List["DolphinElement"]:
        """Parse Dolphin's text output into structured DolphinElement objects.

        Markdown table blocks become "table" elements; everything between
        them becomes "text" elements, preserving document order.
        """
        elements = []
        if not text:
            return elements

        import re

        # Match contiguous runs of Markdown table rows
        table_pattern = re.compile(r"(\|.+\|(?:\n\|.+\|)*)", re.MULTILINE)

        last_end = 0
        for match in table_pattern.finditer(text):
            # Text before table
            pre_text = text[last_end:match.start()].strip()
            if pre_text:
                elements.append(DolphinElement(
                    element_type="text",
                    content=pre_text,
                    page_number=page_number,
                ))

            # Table element
            elements.append(DolphinElement(
                element_type="table",
                content=match.group(0),
                page_number=page_number,
            ))
            last_end = match.end()

        # Remaining text after last table
        remaining = text[last_end:].strip()
        if remaining:
            elements.append(DolphinElement(
                element_type="text",
                content=remaining,
                page_number=page_number,
            ))

        return elements

    @staticmethod
    def _parse_layout_sections(text: str) -> List[Dict[str, Any]]:
        """Parse Dolphin layout output into section descriptors.

        Looks for "<label> [x1, y1, x2, y2]" patterns; when none are
        present, each non-empty line is treated as a bare section label.
        """
        sections = []
        if not text:
            return sections

        import re

        # Pattern: <section_type> [x1, y1, x2, y2]
        bbox_pattern = re.compile(
            r"(\w+[\w\s]*?)\s*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]"
        )

        for match in bbox_pattern.finditer(text):
            sections.append({
                "type": match.group(1).strip(),
                "bbox": [
                    int(match.group(2)),
                    int(match.group(3)),
                    int(match.group(4)),
                    int(match.group(5)),
                ],
            })

        # If no bbox patterns found, treat each line as a section label
        if not sections:
            for line in text.strip().split("\n"):
                line = line.strip()
                if line:
                    sections.append({"type": line, "bbox": []})

        return sections
app/services/ingestion/dolphin/extractor.py ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dolphin Extractor — Extracts structured financial data from Dolphin's parsed output.
3
+
4
+ Converts Dolphin's Markdown/JSON tables and text elements into
5
+ key-value financial data using the existing DataMapper.
6
+ """
7
+
8
+ import re
9
+ import logging
10
+ from typing import Dict, List, Any, Optional
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class DolphinExtractor:
    """
    Extracts financial data from Dolphin's parsed output.

    Works with DolphinPageResult and DolphinElement objects to produce
    a flat dict of {field_name: value} pairs ready for FinancialReport
    construction.

    Usage:
        extractor = DolphinExtractor()
        data = extractor.extract(dolphin_result, doc_classification)
    """

    @staticmethod
    def extract(
        dolphin_result,  # DolphinDocumentResult
        doc_classification=None,  # DocumentClassification
    ) -> Dict[str, Any]:
        """
        Extract all financial data from a Dolphin document result.

        Args:
            dolphin_result: DolphinDocumentResult from client.parse_document()
            doc_classification: Optional classification to guide extraction

        Returns:
            Dict of {standardized_field_name: float_value}
        """
        from app.services.ingestion.mappings import DataMapper

        extracted = {}
        tables_data = []
        text_content_parts = []

        # Separate Dolphin elements into tables and free text
        for page in dolphin_result.pages:
            for element in page.elements:
                if element.element_type == "table":
                    tables_data.append(
                        DolphinExtractor._parse_markdown_table(element.content)
                    )
                elif element.element_type == "text":
                    text_content_parts.append(element.content)

        # --- Strategy 1: Table Extraction (most precise) ---
        for table_rows in tables_data:
            table_data = DolphinExtractor._extract_from_table_rows(
                table_rows, DataMapper
            )
            # First value wins — never overwrite a field already found
            for k, v in table_data.items():
                extracted.setdefault(k, v)

        # --- Strategy 2: Text/Regex Extraction from Dolphin output ---
        full_text = "\n".join(text_content_parts)
        if full_text:
            for k, v in DolphinExtractor._extract_from_text(full_text, DataMapper).items():
                extracted.setdefault(k, v)

        # --- Strategy 3: Full Markdown extraction (catch-all) ---
        if dolphin_result.full_markdown:
            markdown_data = DolphinExtractor._extract_from_text(
                dolphin_result.full_markdown, DataMapper
            )
            for k, v in markdown_data.items():
                extracted.setdefault(k, v)

        logger.info(
            f"Dolphin extracted {len(extracted)} fields from "
            f"{len(tables_data)} tables and {len(text_content_parts)} text blocks"
        )

        return extracted

    @staticmethod
    def extract_company_name(dolphin_result) -> Optional[str]:
        """
        Attempt to extract a company name from Dolphin's parsed output.

        Looks for SEC filing patterns, document headers, and prominent text
        on the first two pages; returns at most 100 characters, or None.
        """
        if not dolphin_result.pages:
            return None

        # Check first page(s) for company name patterns
        for page in dolphin_result.pages[:2]:
            markdown = page.markdown
            if not markdown:
                continue

            # SEC Filing: "Exact name of registrant as specified in its charter"
            registrant_match = re.search(
                r"(?:exact\s+name\s+of\s+registrant|registrant)",
                markdown,
                re.IGNORECASE,
            )
            if registrant_match:
                # The registrant's name usually appears just above the marker
                lines = markdown[: registrant_match.start()].strip().split("\n")
                for line in reversed(lines[-10:]):
                    candidate = line.strip().strip("#").strip("*").strip()
                    if (
                        len(candidate) > 2
                        and not _is_boilerplate(candidate)
                        and any(c.isalpha() for c in candidate)
                    ):
                        return candidate[:100]

            # Markdown heading on first page
            heading_match = re.search(r"^#+\s+(.+)$", markdown, re.MULTILINE)
            if heading_match:
                candidate = heading_match.group(1).strip()
                if len(candidate) > 2 and not _is_boilerplate(candidate):
                    return candidate[:100]

            # Last resort: first non-trivial line near the top of the page
            for line in markdown.split("\n")[:30]:
                candidate = line.strip().strip("#").strip("*").strip()
                if (
                    len(candidate) > 3
                    and not _is_boilerplate(candidate)
                    and any(c.isalpha() for c in candidate)
                ):
                    return candidate[:100]

        return None

    @staticmethod
    def extract_fiscal_year(dolphin_result) -> Optional[str]:
        """Extract a fiscal year/period string from Dolphin output, or None.

        Only the first 5000 characters are scanned — period headers appear
        at the top of financial documents.
        """
        if not dolphin_result.full_markdown:
            return None

        patterns = [
            r"(?:YEAR|PERIOD|FISCAL\s+YEAR)\s+ENDED\s+([A-Z]+\s+\d{1,2},\s+\d{4})",
            r"(?:for\s+the\s+year\s+ended)\s+([A-Z]+\s+\d{1,2},\s+\d{4})",
            r"DECEMBER\s+31,\s+(\d{4})",
            r"(\d{4})\s+(?:annual|fiscal)",
        ]

        text = dolphin_result.full_markdown[:5000]
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)

        return None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_markdown_table(table_text: str) -> List[List[str]]:
        """
        Parse a Markdown-format table into a list of rows.

        Handles:
            | Header1 | Header2 |
            |---------|---------|
            | val1    | val2    |

        Interior empty cells are preserved (as "") so column positions stay
        aligned with the header row; only the empty edge cells produced by
        leading/trailing pipes are dropped. (Previously ALL empty cells were
        removed, which shifted values out of their year columns.)
        """
        rows = []
        for line in table_text.strip().split("\n"):
            line = line.strip()
            if not line.startswith("|"):
                continue
            # Skip separator rows (|---|---|)
            if all(re.match(r"^[\s\-:]+$", c) for c in line.split("|") if c.strip()):
                continue

            cells = [cell.strip() for cell in line.split("|")]
            # Drop only the empty edge cells from leading/trailing pipes
            if cells and cells[0] == "":
                cells = cells[1:]
            if cells and cells[-1] == "":
                cells = cells[:-1]
            if cells:
                rows.append(cells)

        return rows

    @staticmethod
    def _extract_from_table_rows(
        rows: List[List[str]], data_mapper
    ) -> Dict[str, float]:
        """
        Extract financial data from parsed table rows using DataMapper.

        Assumes first column is the label, remaining columns are values.
        Picks the most recent year column if years are detected in headers,
        and applies an "in thousands"/"in millions" scale multiplier when
        the header declares one.
        """
        if not rows:
            return {}

        data = {}

        # Detect target value column (most recent year)
        target_col = _find_target_column(rows)

        # Detect scale multiplier from header text
        multiplier = 1.0
        header_text = " ".join(" ".join(r) for r in rows[:3]).lower()
        if re.search(r"in millions|amounts in millions", header_text):
            multiplier = 1_000_000.0
        elif re.search(r"in thousands|amounts in thousands|\(in 000s\)", header_text):
            multiplier = 1_000.0

        for row in rows:
            if len(row) < 2:
                continue

            label = row[0]
            mapped_field = data_mapper.map_row(label)
            if not mapped_field:
                continue

            # Prefer the target (most recent year) column; otherwise take
            # the first cell that parses as a number.
            val = None
            if target_col is not None and target_col < len(row):
                val = _clean_financial_value(row[target_col])

            if val is None:
                for cell in row[1:]:
                    val = _clean_financial_value(cell)
                    if val is not None:
                        break

            if val is not None:
                data[mapped_field] = val * multiplier

        return data

    @staticmethod
    def _extract_from_text(
        text: str, data_mapper
    ) -> Dict[str, float]:
        """
        Regex-based extraction from unstructured text.

        Catches line items in formats like:
            Revenue ............... $1,234,567
            Net Income (456,789)
        """
        data = {}

        for field, aliases in data_mapper.FIELD_MAPPING.items():
            if field in data:
                continue

            for alias in aliases:
                # Alias followed by optional filler, then a (possibly
                # parenthesized / comma-grouped) number
                pattern = re.compile(
                    rf"{re.escape(alias)}[^0-9\-]*?(\(?[\d,]+\.?\d*\)?)",
                    re.IGNORECASE,
                )
                match = pattern.search(text)
                if match:
                    val = _clean_financial_value(match.group(1))
                    if val is not None:
                        data[field] = val
                        break

        return data
279
+
280
+
281
+ # ---------------------------------------------------------------------------
282
+ # Module-level utility functions
283
+ # ---------------------------------------------------------------------------
284
+
285
+ def _find_target_column(rows: List[List[str]]) -> Optional[int]:
286
+ """Find the column index containing the most recent year."""
287
+ max_year = 0
288
+ target_col = None
289
+
290
+ for row in rows[:5]: # Check headers
291
+ for idx, cell in enumerate(row):
292
+ cell_clean = cell.replace("$", "").strip()
293
+ if re.match(r"^\d{4}$", cell_clean):
294
+ year = int(cell_clean)
295
+ if 2000 < year < 2100 and year > max_year:
296
+ max_year = year
297
+ target_col = idx
298
+
299
+ return target_col
300
+
301
+
302
+ def _clean_financial_value(val_str: Optional[str]) -> Optional[float]:
303
+ """Convert financial string formats to float."""
304
+ if not val_str:
305
+ return None
306
+
307
+ s = val_str.strip().replace("$", "").replace(",", "").replace(" ", "")
308
+ if not s:
309
+ return None
310
+
311
+ # Handle parentheses as negative: (123) → -123
312
+ if "(" in s and ")" in s:
313
+ s = s.replace("(", "-").replace(")", "")
314
+
315
+ # Handle em-dash or dash as zero
316
+ if s in ("-", "—", "–"):
317
+ return 0.0
318
+
319
+ try:
320
+ return float(s)
321
+ except ValueError:
322
+ return None
323
+
324
+
325
+ _BOILERPLATE_PHRASES = {
326
+ "table of contents", "contents", "index", "financial statements",
327
+ "consolidated financial statements", "annual report", "quarterly report",
328
+ "10-k", "10-q", "form 10-k", "form 10-q", "united states",
329
+ "securities and exchange commission", "washington", "d.c.",
330
+ "commission file number", "transition report",
331
+ }
332
+
333
+
334
+ def _is_boilerplate(text: str) -> bool:
335
+ """Check if text is a common boilerplate heading."""
336
+ return text.strip().lower() in _BOILERPLATE_PHRASES or text.strip().isdigit()
app/services/ingestion/dolphin/remote_client.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Remote Dolphin Client — Consumes the Dolphin-as-a-Service API.
3
+
4
+ Sends PDF files to the external AI Worker (Hugging Face Space)
5
+ and receives structured extraction results.
6
+ """
7
+
8
+ import os
9
+ import httpx
10
+ import logging
11
+ from typing import Optional, Dict, Any, List
12
+ from dataclasses import asdict
13
+
14
+ from app.core.config import settings
15
+ from app.services.ingestion.dolphin.client import (
16
+ DolphinDocumentResult,
17
+ DolphinPageResult,
18
+ DolphinLayoutResult,
19
+ DolphinElement,
20
+ )
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class RemoteDolphinClient:
    """
    Client for the remote Dolphin AI worker service.

    Usage:
        client = RemoteDolphinClient(api_url="https://hf.space/...", api_key="...")
        result = client.parse_document("report.pdf")
    """

    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        timeout: int = 300,  # 5 minutes for large PDFs
    ):
        # Fall back to application settings when not passed explicitly.
        resolved_url = api_url or settings.DOLPHIN_API_URL
        self.api_url = resolved_url.rstrip("/")
        self.api_key = api_key or settings.DOLPHIN_API_KEY
        self.timeout = timeout

        if not self.api_url:
            raise ValueError("DOLPHIN_API_URL must be set for RemoteDolphinClient")

        logger.info(f"Initialized RemoteDolphinClient pointing to {self.api_url}")

    def parse_document(self, pdf_path: str) -> DolphinDocumentResult:
        """
        Send PDF to remote worker and reconstruct the result object.
        """
        if not os.path.exists(pdf_path):
            logger.error(f"PDF not found: {pdf_path}")
            return DolphinDocumentResult(total_pages=0)

        endpoint = f"{self.api_url}/process"
        request_headers = {}
        if self.api_key:
            request_headers["Authorization"] = f"Bearer {self.api_key}"

        try:
            logger.info(f"Sending {pdf_path} to remote Dolphin worker...")

            with open(pdf_path, "rb") as pdf_file:
                upload = {"file": (os.path.basename(pdf_path), pdf_file, "application/pdf")}

                with httpx.Client(timeout=self.timeout) as http:
                    response = http.post(endpoint, files=upload, headers=request_headers)
                    response.raise_for_status()

                    payload = response.json()
                    return self._reconstruct_result(payload)

        except httpx.HTTPStatusError as e:
            logger.error(f"Remote Dolphin API error: {e.response.text}")
            raise RuntimeError(f"Dolphin API failed: {e.response.status_code}") from e
        except Exception as e:
            logger.error(f"Remote Dolphin client failed: {e}")
            raise

    def _reconstruct_result(self, data: Dict[str, Any]) -> DolphinDocumentResult:
        """Convert JSON response back to DolphinDocumentResult objects."""
        pages = [
            DolphinPageResult(
                page_number=page["page_number"],
                markdown=page["markdown"],
                structured_json=page.get("structured_json", {}),
                elements=[DolphinElement(**elem) for elem in page.get("elements", [])],
            )
            for page in data.get("pages", [])
        ]

        layouts = [
            DolphinLayoutResult(
                page_number=layout["page_number"],
                sections=layout.get("sections", []),
                reading_order=layout.get("reading_order", []),
                doc_type_hint=layout.get("doc_type_hint", "unknown"),
            )
            for layout in data.get("layouts", [])
        ]

        return DolphinDocumentResult(
            pages=pages,
            layouts=layouts,
            full_markdown=data.get("full_markdown", ""),
            total_pages=data.get("total_pages", 0),
        )
app/services/ingestion/keyword_learner.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Keyword Learner — Self-improving classification from admin-uploaded reference documents.
3
+ ========================================================================================
4
+ Extracts candidate keywords from known-good reference documents, letting admins
5
+ review and approve them to grow the keyword registry over time.
6
+
7
+ Supports batch training with up to 5 files at once.
8
+ """
9
+
10
+ import re
11
+ import logging
12
+ import os
13
+ from collections import Counter
14
+ from typing import List, Dict, Optional, Tuple
15
+
16
+ from .doc_keywords import (
17
+ DOC_TYPE_REGISTRY,
18
+ get_effective_keywords,
19
+ load_learned_keywords,
20
+ save_learned_keywords,
21
+ )
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # ============================================================================
26
+ # CONFIGURATION
27
+ # ============================================================================
28
+ MAX_TRAINING_FILES = 5
29
+ MIN_PHRASE_LENGTH = 3 # Minimum characters for a candidate keyword
30
+ MAX_PHRASE_LENGTH = 80 # Maximum characters for a candidate keyword
31
+ MIN_FREQUENCY = 1 # Minimum appearances across files to be a candidate
32
+ COMMON_STOPWORDS = {
33
+ "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
34
+ "of", "with", "by", "from", "as", "is", "was", "are", "were", "be",
35
+ "been", "being", "have", "has", "had", "do", "does", "did", "will",
36
+ "would", "could", "should", "may", "might", "shall", "can",
37
+ "this", "that", "these", "those", "it", "its", "they", "them",
38
+ "their", "he", "she", "we", "you", "i", "me", "my", "your",
39
+ "page", "date", "total", "amount", "number", "name",
40
+ }
41
+
42
+
43
def extract_candidate_keywords(
    texts: List[str],
    doc_type_key: str,
) -> List[Dict]:
    """
    Extract candidate keywords from reference document texts.

    Analyzes the texts for distinctive phrases that are NOT already
    in the registry for the given doc type.

    Args:
        texts: List of extracted text strings (one per uploaded file)
        doc_type_key: The doc type key (e.g. "rent_roll")

    Returns:
        List of candidate keyword dicts with:
        - keyword: the candidate text
        - frequency: total appearances across all files
        - files_found_in: how many files it appeared in
        - confidence: estimated relevance score (0-1)
        - is_unique_to_type: True if no other doc type uses this keyword
    """
    if doc_type_key not in DOC_TYPE_REGISTRY:
        logger.warning(f"Unknown doc type key: {doc_type_key}")
        return []

    # Get existing keywords for this type (base + learned)
    existing = {kw.lower() for kw in get_effective_keywords(doc_type_key)}

    # Get ALL keywords across ALL types to find what's unique to this type
    all_other_keywords = set()
    for key in DOC_TYPE_REGISTRY:
        if key != doc_type_key:
            for kw in get_effective_keywords(key):
                all_other_keywords.add(kw.lower())

    # Extract phrases from all texts
    phrase_counter = Counter()
    file_presence = Counter()  # Track in how many files each phrase appears

    for text in texts:
        if not text:
            continue
        found_in_this_file = set()
        for phrase in _extract_phrases(text):
            phrase_lower = phrase.lower().strip()
            if (
                MIN_PHRASE_LENGTH <= len(phrase_lower) <= MAX_PHRASE_LENGTH
                and phrase_lower not in existing
                and phrase_lower not in COMMON_STOPWORDS
                and not phrase_lower.isdigit()
            ):
                phrase_counter[phrase_lower] += 1
                if phrase_lower not in found_in_this_file:
                    file_presence[phrase_lower] += 1
                    found_in_this_file.add(phrase_lower)

    # Score candidates
    candidates = []
    num_files = len(texts)
    for phrase, count in phrase_counter.most_common(100):
        files_in = file_presence.get(phrase, 0)

        # BUG FIX: `is_unique` and `uniqueness_bonus` were referenced below
        # but never assigned, raising NameError for any non-empty candidate
        # set. A phrase used by no OTHER doc type is distinctive for this
        # one; the 0.3 bonus matches the original scoring comments
        # (e.g. "0.4 + 0.3 + 0.4 = 1.1 -> 1.0").
        is_unique = phrase not in all_other_keywords
        uniqueness_bonus = 0.3 if is_unique else 0.0

        # Confidence scoring:
        # - Higher if phrase appears in more files (consistent)
        # - Higher if phrase is NOT in other doc types (distinctive)
        # - Lower if it's very generic
        if num_files == 1:
            # Specialized scoring for single-file uploads (demo mode):
            # base confidence for a phrase in 1 file is low, so raw
            # frequency matters much more here.
            base_score = 0.4

            if count >= 5:
                freq_bonus = 0.4
            elif count >= 3:
                freq_bonus = 0.2
            elif count >= 2:
                freq_bonus = 0.1
            else:
                freq_bonus = 0.0

            confidence = base_score + uniqueness_bonus + freq_bonus
        else:
            # Standard multi-file scoring: consistency across files
            # matters most.
            file_ratio = files_in / max(num_files, 1)
            confidence = (file_ratio * 0.5) + uniqueness_bonus + (0.2 if count > 2 else 0.0)

        confidence = min(confidence, 1.0)

        if confidence >= 0.2:  # Only suggest if minimally confident
            candidates.append({
                "keyword": phrase,
                "frequency": count,
                "files_found_in": files_in,
                "confidence": round(confidence, 2),
                "is_unique_to_type": is_unique,
            })

    # Sort by confidence desc, then frequency desc
    candidates.sort(key=lambda x: (-x["confidence"], -x["frequency"]))

    return candidates[:50]  # Return top 50 candidates
146
+
147
+
148
def approve_keywords(
    doc_type_key: str,
    keywords: List[str],
) -> Dict:
    """
    Approve candidate keywords and persist them to the learned registry.

    Args:
        doc_type_key: The doc type key
        keywords: List of keyword strings to approve

    Returns:
        Dict with status and counts
    """
    if doc_type_key not in DOC_TYPE_REGISTRY:
        return {"error": f"Unknown doc type: {doc_type_key}", "added": 0}

    learned = load_learned_keywords()
    bucket = learned.setdefault(doc_type_key, [])

    added = 0
    for raw_kw in keywords:
        candidate = raw_kw.strip()
        # Skip blanks and keywords that were already learned.
        if candidate and candidate not in bucket:
            bucket.append(candidate)
            added += 1

    save_learned_keywords(learned)

    total_keywords = len(get_effective_keywords(doc_type_key))

    logger.info(
        f"Approved {added} new keywords for '{doc_type_key}'. "
        f"Total effective keywords: {total_keywords}"
    )

    return {
        "doc_type": doc_type_key,
        "added": added,
        "total_learned": len(learned.get(doc_type_key, [])),
        "total_effective": total_keywords,
    }
192
+
193
+
194
def get_training_stats() -> Dict:
    """
    Get training statistics for the admin dashboard.

    Returns:
        Dict with per-type learned keyword counts and totals.
    """
    learned = load_learned_keywords()

    per_type: Dict = {}
    for key, doc_type in DOC_TYPE_REGISTRY.items():
        learned_count = len(learned.get(key, []))
        if learned_count > 0:
            base_count = len(doc_type.keywords)
            per_type[key] = {
                "display_name": doc_type.display_name,
                "base_keywords": base_count,
                "learned_keywords": learned_count,
                "total_keywords": base_count + learned_count,
            }

    return {
        "total_learned_keywords": sum(len(v) for v in learned.values()),
        "types_with_learned": len(learned),
        "per_type": per_type,
    }
219
+
220
+
221
+ # ============================================================================
222
+ # INTERNAL HELPERS
223
+ # ============================================================================
224
+
225
def _extract_phrases(text: str) -> List[str]:
    """
    Extract meaningful phrases from document text.

    Looks for:
    - Multi-word capitalized headers/labels (e.g., "TOTAL NET REVENUE")
    - Key-value labels before colons (e.g., "Policy Number:")
    - Table header-like strings
    """
    collected: List[str] = []

    # 1. All-caps multi-word phrases (headers, labels)
    for match in re.finditer(r'\b([A-Z][A-Z\s&/\-\']{2,}[A-Z])\b', text):
        caps_phrase = match.group(1).strip()
        if len(caps_phrase) >= MIN_PHRASE_LENGTH:
            collected.append(caps_phrase)

    # 2. Labels before colons
    for match in re.finditer(r'([A-Za-z][\w\s&/\-]{2,})\s*:', text):
        label = match.group(1).strip()
        if len(label) >= MIN_PHRASE_LENGTH:
            collected.append(label)

    # 3. Lines that look like section headers (Title Case at start of line)
    connector_words = ('and', 'or', 'of', 'the', 'in', 'for', 'to', 'by', '&', '/', '-')
    for raw_line in text.split('\n'):
        candidate = raw_line.strip()
        if not candidate or len(candidate) > MAX_PHRASE_LENGTH:
            continue
        words = candidate.split()
        if len(words) < 2:
            continue
        # Each word is capitalized (or an allowed lowercase connector).
        if all(w[0].isupper() or w in connector_words for w in words if w):
            collected.append(candidate)

    return collected
app/services/ingestion/learned_keywords.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
app/services/ingestion/mappings.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Mapper - Field name normalization for financial data.
3
+
4
+ Maps various field names from different file formats (CSV, PDF, XLSX)
5
+ to standardized internal field names.
6
+ """
7
+
8
+ from typing import Dict, List, Optional, Tuple
9
+
10
+
11
class DataMapper:
    """
    Maps raw field names to standardized internal field names.

    Matching strategy (shared by map_row / map_row_with_confidence):
    1. Exact match against a canonical field name (or a known alias).
    2. Longest-alias substring match, subject to per-field EXCLUSIONS
       (e.g. "cost of revenue" must never map to "revenue").

    Usage:
        field = DataMapper.map_row("Total Revenue")  # Returns "revenue"
        field = DataMapper.map_row("Accounts Receivable")  # Returns "accounts_receivable"
    """

    FIELD_MAPPING: Dict[str, List[str]] = {
        # =================================================================
        # INCOME STATEMENT
        # =================================================================
        "revenue": [
            "revenue", "sales", "gross sales", "total revenue", "net sales",
            "total net sales", "net revenue", "total sales", "service revenue",
            "product revenue", "subscription revenue", "recurring revenue",
            "operating revenue", "revenues, net", "revenues"
        ],
        "cogs": [
            "cogs", "cost of goods", "direct costs", "cost of sales",
            "cost of revenue", "cost of goods sold", "cost of products sold",
            "cost of services", "direct cost", "cost of merchandise"
        ],
        "marketing_expenses": [
            "marketing", "ad spend", "advertising", "marketing expense",
            "promotion", "marketing and advertising", "advertising expense",
            "marketing costs", "promotional expense", "customer acquisition"
        ],
        "payroll_expenses": [
            "payroll", "salaries", "wages", "employee costs", "personnel",
            "labor", "compensation", "salaries and wages", "employee benefits",
            "stock compensation", "share-based compensation", "labor cost",
            "wages and salaries", "staff costs"
        ],
        "rent_expense": [
            "rent", "lease", "occupancy", "facilities", "rent expense",
            "lease expense", "occupancy costs", "facility costs"
        ],
        "other_operating_expenses": [
            "other expense", "operating expense", "sga", "general and administrative",
            "g&a", "selling, general", "pre-opening", "impairment",
            "administrative expense", "operating expenses", "other operating",
            "research and development", "r&d", "utilities", "insurance"
        ],
        "depreciation": [
            "depreciation", "depreciation expense", "depreciation and amortization"
        ],
        "amortization": [
            "amortization", "amortization expense"
        ],
        "interest_expense": [
            "interest", "interest expense", "finance costs", "interest cost",
            "interest and finance charges", "borrowing costs"
        ],
        "taxes": [
            "tax", "income tax", "taxes", "provision for taxes", "income tax expense",
            "tax expense", "provision for income taxes"
        ],

        # =================================================================
        # BALANCE SHEET - ASSETS
        # =================================================================
        "cash": [
            "cash", "bank", "cash and equivalents", "cash & equivalents",
            "cash and cash equivalents", "cash on hand", "short-term investments",
            "cash, cash equivalents"
        ],
        "accounts_receivable": [
            "accounts receivable", "ar", "receivables", "trade receivables",
            "net receivables", "receivables, net", "trade accounts receivable"
        ],
        "inventory": [
            "inventory", "stock", "merchandise", "inventories",
            "merchandise inventory", "raw materials"
        ],
        "prepaid_expenses": [
            "prepaid", "prepaid expenses", "other current assets",
            "prepaid and other", "prepaids"
        ],
        "property_plant_equipment": [
            "ppe", "fixed assets", "property plant equipment", "equipment",
            "property, plant and equipment", "property and equipment",
            "net property", "fixed assets, net", "capital assets"
        ],
        "accumulated_depreciation": [
            "accumulated depreciation", "acc depreciation", "less depreciation"
        ],
        "intangible_assets": [
            "intangible assets", "goodwill", "soft assets", "intangibles",
            "goodwill and intangibles"
        ],

        # =================================================================
        # BALANCE SHEET - LIABILITIES
        # =================================================================
        "accounts_payable": [
            "accounts payable", "ap", "payables", "trade payables",
            "trade accounts payable"
        ],
        "accrued_liabilities": [
            "accrued liabilities", "accrued expenses", "accruals",
            "accrued and other"
        ],
        "short_term_debt": [
            "short term debt", "current portion of debt", "notes payable",
            "current debt", "short-term borrowings", "current portion of long-term debt"
        ],
        "long_term_debt": [
            "long term debt", "term loan", "non-current liabilities",
            "long-term borrowings", "bonds payable", "notes payable long-term"
        ],
        "deferred_revenue": [
            "deferred revenue", "unearned revenue", "contract liabilities",
            "deferred income"
        ],
        "total_equity": [
            "equity", "retained earnings", "shareholders equity", "total equity",
            "stockholders equity", "shareholders' equity", "stockholders' equity",
            "total shareholders equity", "net worth", "owner equity"
        ],

        # =================================================================
        # CASH FLOW STATEMENT
        # =================================================================
        "operating_cash_flow": [
            "operating cash flow", "cfo", "cash from operations",
            "cash flow from operating activities", "net cash from operating",
            "cash generated by operating activities", "operating activities",
            "net cash provided by operating", "cash flows from operating"
        ],
        "capex": [
            "capex", "capital expenditure", "purchase of property",
            "additions to property", "capital expenditures",
            "purchases of property", "property additions"
        ],
        "investing_cash_flow": [
            "investing cash flow", "cash from investing",
            "cash flow from investing activities", "investing activities",
            "net cash from investing", "cash flows from investing"
        ],
        "financing_cash_flow": [
            "financing cash flow", "cash from financing",
            "cash flow from financing activities", "financing activities",
            "net cash from financing", "cash flows from financing"
        ],

        # =================================================================
        # OPERATING METRICS
        # =================================================================
        "new_customers": ["new customers", "customer additions", "new users"],
        "total_transactions": ["transactions", "orders", "total orders"],
        "total_seats": ["seats", "licenses", "subscriptions"],
        "active_members": ["members", "active count", "active users"],
        "restaurant_margin": ["restaurant margin", "store margin"],
        "effective_tax_rate": ["effective tax rate", "tax rate"],
        "churn_rate": ["churn", "churn rate", "attrition", "cancellation rate"],
        "cac": ["cac", "acquisition cost", "customer acquisition cost"],
        "ltv": ["ltv", "lifetime value", "cltv", "customer lifetime value"],

        # =================================================================
        # DERIVED / SUMMARY ITEMS (often in Excel templates)
        # =================================================================
        "gross_profit": [
            "gross profit", "gross margin", "gross income"
        ],
        "operating_income": [
            "operating income", "operating profit", "ebit", "income from operations"
        ],
        "net_income": [
            "net income", "net profit", "net earnings", "net income attributable"
        ],
        "ebitda": [
            "ebitda", "earnings before interest"
        ],
        "total_assets": [
            "total assets", "assets total"
        ],
        "total_liabilities": [
            "total liabilities", "liabilities total"
        ],

        # =================================================================
        # SPECIALIZED DOCUMENT TYPE FIELDS
        # =================================================================

        # ARR / MRR (SaaS)
        "beginning_arr": [
            "beginning arr", "opening arr", "start arr", "bop arr"
        ],
        "new_logo_arr": [
            "new logo arr", "new logos", "new business arr", "new customer arr"
        ],
        "expansion_arr": [
            "expansion arr", "upsell", "cross-sell", "expansion revenue"
        ],
        "contraction_arr": [
            "contraction arr", "downgrade", "contraction", "downsell"
        ],
        "churn_arr": [
            "churn arr", "churned arr", "lost arr", "cancellation arr"
        ],
        "ending_arr": [
            "ending arr", "closing arr", "exit arr", "eop arr"
        ],

        # Deferred Revenue / ASC 606
        "beginning_balance": [
            "beginning balance", "opening balance", "beginning deferred revenue"
        ],
        "billings": [
            "billings", "invoiced", "new contracts billed", "fees billed"
        ],
        "revenue_recognized": [
            "revenue recognized", "earned revenue", "satisfaction of performance obligation"
        ],
        "ending_balance": [
            "ending balance", "closing balance", "ending deferred revenue"
        ],

        # Real Estate / NOI
        "rental_revenue": [
            "rental revenue", "rental income", "gross potential rent"
        ],
        "vacancy_loss": [
            "vacancy loss", "vacancy", "credit loss", "vacancy & credit loss"
        ],
        "noi": [
            "net operating income", "noi", "income before debt service"
        ],
        "management_fees": [
            "management fees", "property management", "management expense"
        ],

        # Manufacturing / COGM
        "direct_materials": [
            "direct materials", "raw materials consumed", "material costs"
        ],
        "direct_labor": [
            "direct labor", "manufacturing labor", "touch labor"
        ],
        "factory_overhead": [
            "factory overhead", "manufacturing overhead", "indirect costs", "burden"
        ],

        # Energy / Oil & Gas
        "proved_reserves": [
            "proved reserves", "1p reserves", "total proved"
        ],
        "pv10_value": [
            "pv-10", "present value at 10%", "discounted future net cash flows"
        ],
        "working_interest": [
            "working interest", "wi %", "decimal interest"
        ],
    }

    # Exclusion rules: (field, [terms that should NOT trigger this field])
    EXCLUSIONS: Dict[str, List[str]] = {
        "revenue": ["cost", "marketable securities", "deferred"],
        "total_equity": ["awards", "liability", "liabilities", "debt"],
        "cash": ["non-cash", "noncash"],
        "depreciation": ["accum", "accumulated"],
    }

    # Field categories for validation
    INCOME_FIELDS = [
        "revenue", "cogs", "marketing_expenses", "payroll_expenses", "rent_expense",
        "other_operating_expenses", "depreciation", "amortization", "interest_expense", "taxes",
        "gross_profit", "operating_income", "net_income", "ebitda"
    ]

    BALANCE_FIELDS = [
        "cash", "accounts_receivable", "inventory", "prepaid_expenses",
        "property_plant_equipment", "accumulated_depreciation", "intangible_assets",
        "accounts_payable", "accrued_liabilities", "short_term_debt", "long_term_debt",
        "deferred_revenue", "total_equity", "total_assets", "total_liabilities"
    ]

    CASH_FIELDS = [
        "operating_cash_flow", "capex", "investing_cash_flow", "financing_cash_flow"
    ]

    @staticmethod
    def _normalize_label(row_label: str) -> str:
        """Lowercase, strip, and de-underscore a raw label for matching."""
        return str(row_label).lower().strip().replace("_", " ")

    @staticmethod
    def _best_substring_match(label_clean: str) -> Tuple[Optional[str], int]:
        """
        Find the longest alias contained in *label_clean*.

        Shared core of map_row and map_row_with_confidence (previously
        duplicated in both). Applies EXCLUSIONS so that e.g. a label
        containing "cost" can never map to "revenue".

        Returns:
            (field_name, matched_alias_length), or (None, 0) if no match.
        """
        best_field: Optional[str] = None
        best_len = 0

        for field, aliases in DataMapper.FIELD_MAPPING.items():
            # Exclusions depend only on the label, so check once per field.
            excluded_terms = DataMapper.EXCLUSIONS.get(field, ())
            if any(term in label_clean for term in excluded_terms):
                continue

            for alias in aliases:
                # Longest alias match wins (more specific).
                if len(alias) > best_len and alias in label_clean:
                    best_len = len(alias)
                    best_field = field

        return best_field, best_len

    @staticmethod
    def map_row(row_label: str) -> Optional[str]:
        """
        Map a raw field label to a standardized field name.

        Args:
            row_label: The raw label from the source file

        Returns:
            Standardized field name, or None if no match found
        """
        if not row_label:
            return None

        label_clean = DataMapper._normalize_label(row_label)

        # Exact canonical-name match wins immediately (O(1) dict lookup
        # instead of the previous full scan over FIELD_MAPPING).
        if label_clean in DataMapper.FIELD_MAPPING:
            return label_clean

        field, _ = DataMapper._best_substring_match(label_clean)
        return field

    @staticmethod
    def map_row_with_confidence(row_label: str) -> Tuple[Optional[str], float]:
        """
        Map a row label and return confidence score.

        Returns:
            Tuple of (field_name, confidence) where confidence is 0.0-1.0
        """
        if not row_label:
            return None, 0.0

        label_clean = DataMapper._normalize_label(row_label)

        # Exact match (canonical name or alias) = 1.0 confidence
        if label_clean in DataMapper.FIELD_MAPPING:
            return label_clean, 1.0
        for field, aliases in DataMapper.FIELD_MAPPING.items():
            if label_clean in aliases:
                return field, 1.0

        # Partial match = proportional confidence, capped below exact.
        field, match_len = DataMapper._best_substring_match(label_clean)
        if field is None:
            return None, 0.0

        confidence = match_len / len(label_clean)
        return field, min(confidence, 0.95)  # Cap at 0.95 for non-exact

    @staticmethod
    def get_statement_type(field: str) -> Optional[str]:
        """
        Determine which financial statement a field belongs to.

        Returns:
            "income", "balance", "cash_flow", or None
        """
        if field in DataMapper.INCOME_FIELDS:
            return "income"
        elif field in DataMapper.BALANCE_FIELDS:
            return "balance"
        elif field in DataMapper.CASH_FIELDS:
            return "cash_flow"
        return None
app/services/ingestion/parser_csv.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ from typing import Dict, Any, Optional
4
+ from app.schemas.financial import (
5
+ FinancialReport,
6
+ BalanceSheetStandard,
7
+ IncomeStatementStandard,
8
+ CashFlowStandard,
9
+ OperatingMetrics,
10
+ PeriodType,
11
+ Currency
12
+ )
13
+ from datetime import date
14
+
15
+ from app.services.ingestion.mappings import DataMapper
16
+
17
class CSVParser:
    """
    Parse a financial CSV into a FinancialReport.

    Supports two layouts:
    - Horizontal: columns are field names, rows are periods (last row is
      taken as the most recent period).
    - Vertical: column 0 is the label, column 1 is the current value.
    """

    @staticmethod
    def _to_float(val_raw) -> float:
        """
        Coerce a raw cell value to a float with accounting-style cleanup.

        Handles "$1,234", parenthesized negatives "(56)" -> -56.0, and
        dash / em-dash (or blank) as zero. Unparseable strings and
        null cells yield 0.0.

        Previously this cleanup was duplicated verbatim in both layout
        branches, with a bare `except:` that hid real errors.
        """
        if isinstance(val_raw, str):
            s = val_raw.strip().replace("$", "").replace(",", "").replace(" ", "")
            # Handle (123) as negative (accounting convention)
            if "(" in s and ")" in s:
                s = s.replace("(", "-").replace(")", "")
            # Handle dash/em-dash as zero
            if s in ("-", "—", ""):
                return 0.0
            try:
                return float(s)
            except ValueError:  # was a bare `except:` — only parse errors expected
                return 0.0
        return float(val_raw) if pd.notnull(val_raw) else 0.0

    @staticmethod
    def parse(file_path: str) -> FinancialReport:
        """
        Parse the CSV at *file_path* and return a populated FinancialReport.

        Missing fields default to 0.0 (or None for integer metrics).
        """
        df = pd.read_csv(file_path)

        data_dict: Dict[str, float] = {}

        # Detect horizontal format: require at least 3 columns that map
        # to known fields before trusting the column headers.
        matches = sum(1 for col in df.columns if DataMapper.map_row(str(col)))

        if matches >= 3:
            # Horizontal format: take the last row (most recent data).
            last_row = df.iloc[-1]
            for col in df.columns:
                field = DataMapper.map_row(str(col))
                if field:
                    data_dict[field] = CSVParser._to_float(last_row[col])

        elif len(df.columns) >= 2:
            # Vertical (key-value) format: col 0 is label, col 1 is the
            # current period value. Use .iloc for positional access
            # (plain row[0] on a Series is deprecated label-based access).
            for _, row in df.iterrows():
                field = DataMapper.map_row(str(row.iloc[0]))
                if field:
                    data_dict[field] = CSVParser._to_float(row.iloc[1])

        def get(key, default=0.0):
            return data_dict.get(key, default)

        def get_int(key):
            # Integer metrics: None when absent or zero (matches the
            # original `int(get(k)) if get(k) else None` semantics
            # without calling get() twice).
            value = get(key)
            return int(value) if value else None

        income = IncomeStatementStandard(
            revenue=get("revenue"),
            cogs=get("cogs"),
            marketing_expenses=get("marketing_expenses"),
            payroll_expenses=get("payroll_expenses"),
            rent_expense=get("rent_expense"),
            other_operating_expenses=get("other_operating_expenses"),
            depreciation=get("depreciation"),
            amortization=get("amortization"),
            interest_expense=get("interest_expense"),
            taxes=get("taxes")
        )

        balance = BalanceSheetStandard(
            cash=get("cash"),
            accounts_receivable=get("accounts_receivable"),
            inventory=get("inventory"),
            prepaid_expenses=get("prepaid_expenses"),
            property_plant_equipment=get("property_plant_equipment"),
            accumulated_depreciation=get("accumulated_depreciation"),
            intangible_assets=get("intangible_assets"),
            accounts_payable=get("accounts_payable"),
            accrued_liabilities=get("accrued_liabilities"),
            short_term_debt=get("short_term_debt"),
            long_term_debt=get("long_term_debt"),
            deferred_revenue=get("deferred_revenue"),
            total_equity=get("total_equity")
        )

        cash_flow = CashFlowStandard(
            operating_cash_flow=get("operating_cash_flow"),
            capex=get("capex"),
            investing_cash_flow=get("investing_cash_flow"),
            financing_cash_flow=get("financing_cash_flow")
        )

        metrics = OperatingMetrics(
            industry='general',  # Default, could extract from metadata
            new_customers=get_int("new_customers"),
            total_transactions=get_int("total_transactions"),
            total_seats=get_int("total_seats")
        )

        return FinancialReport(
            company_name="Imported Company",
            period_end=date.today(),
            period_type=PeriodType.ANNUAL,
            currency=Currency.USD,
            income_statement=income,
            balance_sheet=balance,
            cash_flow=cash_flow,
            metrics=metrics
        )
app/services/ingestion/parser_dolphin.py ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hybrid PDF Parser — Engines-First Classification Pipeline.
3
+
4
+ Both engines extract text simultaneously, then the combined text
5
+ is classified against 53 document types using keyword scoring.
6
+
7
+ Stage 1: Both engines extract text (Dolphin + pdfplumber, parallel)
8
+ Stage 2: Combined text → keyword scan against 53 doc types (80% threshold)
9
+ Stage 3: Doc-type-aware targeted extraction (using field list)
10
+ Stage 4: Merge extractions (Dolphin priority, pdfplumber gap-fill)
11
+ Stage 5: Standardize & build FinancialReport
12
+ """
13
+
14
+ import logging
15
+ import re
16
+ from typing import Dict, Any, Optional, List
17
+ from datetime import date
18
+
19
+ from app.schemas.financial import (
20
+ FinancialReport,
21
+ BalanceSheetStandard,
22
+ IncomeStatementStandard,
23
+ CashFlowStandard,
24
+ OperatingMetrics,
25
+ PeriodType,
26
+ Currency,
27
+ )
28
+ from app.services.ingestion.mappings import DataMapper
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
class HybridPDFParser:
    """
    Hybrid parser that combines Dolphin-v2 deep parsing with pdfplumber
    gap-filling on every PDF for maximum extraction coverage.

    Implements the same `parse(file_path) -> FinancialReport` interface
    as the original PDFParser.
    """

    @staticmethod
    def parse(file_path: str) -> FinancialReport:
        """
        Parse a PDF using the engines-first classification pipeline.

        Stages:
        1. Both engines extract text simultaneously
        2. Combined text -> classify against 53 doc types (80% threshold)
        3. Type-aware targeted extraction
        4. Merge extractions (Dolphin priority, pdfplumber gap-fill)
        5. Build FinancialReport with classification metadata

        Falls back to pdfplumber-only if Dolphin is unavailable.
        """
        dolphin_data = {}
        pdfplumber_data = {}
        classification = None
        dolphin_company_name = None
        dolphin_fiscal_year = None
        dolphin_text = ""
        doc_result = None
        extraction_method = "pdfplumber"

        # =================================================================
        # Stage 1: Both engines extract text simultaneously
        # =================================================================
        logger.info("Stage 1: Extracting text from both engines")

        # 1a. Dolphin text extraction (best-effort: any failure degrades
        # gracefully to pdfplumber-only mode).
        try:
            from app.services.ingestion.dolphin import is_dolphin_available

            if is_dolphin_available():
                from app.services.ingestion.dolphin.client import DolphinClient
                from app.services.ingestion.dolphin.extractor import DolphinExtractor

                client = DolphinClient.create()
                doc_result = client.parse_document(file_path)
                dolphin_text = doc_result.full_markdown if doc_result.total_pages > 0 else ""
                dolphin_company_name = DolphinExtractor.extract_company_name(doc_result) if doc_result.total_pages > 0 else None
                dolphin_fiscal_year = DolphinExtractor.extract_fiscal_year(doc_result) if doc_result.total_pages > 0 else None
                extraction_method = "dolphin_hybrid"
                logger.info(f"Dolphin extracted {len(dolphin_text)} chars from {doc_result.total_pages} pages")
            else:
                logger.info("Dolphin not available — pdfplumber-only mode")
        except Exception as e:
            logger.warning(f"Dolphin text extraction failed: {e}")

        # 1b. pdfplumber text extraction
        pdfplumber_text = ""
        try:
            import pdfplumber
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        pdfplumber_text += page_text + "\n"
            logger.info(f"pdfplumber extracted {len(pdfplumber_text)} chars")
        except Exception as e:
            logger.warning(f"pdfplumber text extraction failed: {e}")

        # =================================================================
        # Stage 2: Combined text → classify against 53 doc types
        # =================================================================
        combined_text = f"{dolphin_text}\n{pdfplumber_text}"
        logger.info(f"Stage 2: Classifying {len(combined_text)} chars against 53 doc types")

        from app.services.ingestion.dolphin.classifier import DocumentClassifier

        # Collect Dolphin layout sections if available — they sharpen
        # the classifier's scoring.
        all_sections = []
        if doc_result and doc_result.total_pages > 0:
            for layout in doc_result.layouts:
                all_sections.extend(layout.sections)

        classification = DocumentClassifier.classify(
            text_content=combined_text,
            dolphin_sections=all_sections,
        )

        logger.info(
            f"Classified as '{classification.doc_type}' "
            f"({classification.doc_type_display}) — "
            f"{classification.match_percentage}% match, "
            f"confidence={classification.confidence:.2f}, "
            f"needs_review={classification.needs_review}"
        )

        # =================================================================
        # Stage 3: Type-aware targeted extraction
        # =================================================================
        logger.info(f"Stage 3: Extracting data (type-aware: {classification.doc_type})")

        # 3a. Dolphin structured extraction
        if doc_result and doc_result.total_pages > 0:
            try:
                from app.services.ingestion.dolphin.extractor import DolphinExtractor
                dolphin_data = DolphinExtractor.extract(doc_result, classification)
            except Exception as e:
                logger.warning(f"Dolphin extraction failed: {e}")

        # 3b. pdfplumber targeted extraction
        pdfplumber_data = HybridPDFParser._run_pdfplumber_extraction(
            file_path, classification
        )

        # =================================================================
        # Stage 4: Merge — Dolphin priority, pdfplumber gap-fill
        # =================================================================
        merged_data = HybridPDFParser._merge_extractions(dolphin_data, pdfplumber_data)

        logger.info(
            f"Stage 4: Merged {len(dolphin_data)} Dolphin + "
            f"{len(pdfplumber_data)} pdfplumber → "
            f"{len(merged_data)} total fields"
        )

        # =================================================================
        # Stage 5: Build FinancialReport
        # =================================================================
        return HybridPDFParser._build_report(
            extracted_data=merged_data,
            text_content=combined_text,
            file_path=file_path,
            extraction_method=extraction_method,
            classification=classification,
            dolphin_company_name=dolphin_company_name,
            dolphin_fiscal_year=dolphin_fiscal_year,
        )

    # ==================================================================
    # Stage Implementations
    # ==================================================================

    @staticmethod
    def _run_pdfplumber_extraction(file_path: str, classification=None) -> Dict[str, Any]:
        """
        pdfplumber targeted extraction — tables + regex.

        Uses classification to guide which statement types to look for.
        Reuses the proven logic from the existing PDFParser.
        Returns an empty dict on any pdfplumber failure (best-effort).
        """
        from app.services.ingestion.parser_pdf import PDFParser
        from app.services.ingestion.dolphin.classifier import DocumentClassifier
        import pdfplumber

        extracted_data = {}

        try:
            with pdfplumber.open(file_path) as pdf:
                # Determine which statement types to extract based on classification
                if classification:
                    target_types = DocumentClassifier.get_financial_statement_types(classification)
                else:
                    target_types = ["income", "balance", "cash_flow"]

                # Statement page locator
                statement_pages = PDFParser._find_statement_pages(pdf)

                # Extract from identified statement pages
                for stmt_type, page in statement_pages.items():
                    if stmt_type not in target_types:
                        continue  # Skip statement types not relevant to this doc

                    # Restrict table mapping to fields belonging to this
                    # statement, so look-alike labels don't cross-pollute.
                    allowed_fields = None
                    if stmt_type == "income":
                        allowed_fields = DataMapper.INCOME_FIELDS
                    elif stmt_type == "balance":
                        allowed_fields = DataMapper.BALANCE_FIELDS
                    elif stmt_type == "cash_flow":
                        allowed_fields = DataMapper.CASH_FIELDS

                    table_data = PDFParser._extract_table_data(page, allowed_fields)
                    extracted_data.update(table_data)

                # Full text for regex fallback
                text_content = ""
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text_content += page_text + "\n"

                # Regex fallback for fields the tables did not yield
                regex_data = PDFParser._extract_via_regex(
                    text_content, existing_keys=extracted_data.keys()
                )
                extracted_data.update(regex_data)

        except Exception as e:
            logger.warning(f"pdfplumber extraction failed: {e}")

        return extracted_data

    @staticmethod
    def _merge_extractions(
        dolphin_data: Dict[str, Any],
        pdfplumber_data: Dict[str, Any],
    ) -> Dict[str, Any]:
        """
        Merge Dolphin and pdfplumber extractions.

        Priority: Dolphin fields take precedence.
        pdfplumber fills any gaps not covered by Dolphin. A Dolphin value
        of None counts as a gap, and a Dolphin 0.0 is replaced only by a
        real (non-None, non-zero) pdfplumber value.
        """
        merged = dict(dolphin_data)  # Start with Dolphin data

        for key, value in pdfplumber_data.items():
            if key not in merged or merged[key] is None:
                # Gap-fill: Dolphin never produced this field.
                merged[key] = value
            elif merged[key] == 0.0 and value not in (None, 0.0):
                # Dolphin reported 0 but pdfplumber found a real value.
                merged[key] = value

        return merged

    # ==================================================================
    # Report Construction (mirrors PDFParser logic)
    # ==================================================================

    @staticmethod
    def _build_report(
        extracted_data: Dict,
        text_content: str,
        file_path: str,
        extraction_method: str,
        classification=None,
        dolphin_company_name: Optional[str] = None,
        dolphin_fiscal_year: Optional[str] = None,
    ) -> FinancialReport:
        """Build a FinancialReport from merged extracted data.

        Derived totals (gross profit, EBITDA, balance-sheet totals) are
        recomputed from the extracted components rather than trusted from
        the document.
        """

        def get(key, default=0.0):
            # Missing keys AND explicit None both fall back to default.
            val = extracted_data.get(key)
            return val if val is not None else default

        # --- Income Statement ---
        revenue = get("revenue")
        cogs = get("cogs")
        marketing = get("marketing_expenses")
        payroll = get("payroll_expenses")
        rent = get("rent_expense")
        other = get("other_operating_expenses")
        depreciation = get("depreciation")
        amortization = get("amortization")
        interest = get("interest_expense")
        taxes = get("taxes")

        op_expenses = marketing + payroll + rent + other
        gross_profit = revenue - cogs
        ebitda = gross_profit - op_expenses
        op_income = ebitda - depreciation - amortization
        net_income = op_income - interest - taxes

        income = IncomeStatementStandard(
            revenue=revenue, cogs=cogs,
            marketing_expenses=marketing, payroll_expenses=payroll,
            rent_expense=rent, other_operating_expenses=other,
            depreciation=depreciation, amortization=amortization,
            interest_expense=interest, taxes=taxes,
            operating_expenses=op_expenses, gross_profit=gross_profit,
            ebitda=ebitda, operating_income=op_income, net_income=net_income,
        )

        # --- Balance Sheet ---
        cash = get("cash")
        ar = get("accounts_receivable")
        inv = get("inventory")
        prepaid = get("prepaid_expenses")
        ppe = get("property_plant_equipment")
        accum_dep = get("accumulated_depreciation")
        intangibles = get("intangible_assets")
        ap = get("accounts_payable")
        accrued = get("accrued_liabilities")
        st_debt = get("short_term_debt")
        lt_debt = get("long_term_debt")
        deferred = get("deferred_revenue")
        equity = get("total_equity")

        bs_current_assets = cash + ar + inv + prepaid
        bs_total_assets = bs_current_assets + (ppe - accum_dep) + intangibles
        bs_current_liab = ap + accrued + st_debt
        bs_total_liab = bs_current_liab + lt_debt + deferred

        balance = BalanceSheetStandard(
            cash=cash, accounts_receivable=ar, inventory=inv,
            prepaid_expenses=prepaid, property_plant_equipment=ppe,
            accumulated_depreciation=accum_dep, intangible_assets=intangibles,
            accounts_payable=ap, accrued_liabilities=accrued,
            short_term_debt=st_debt, long_term_debt=lt_debt,
            deferred_revenue=deferred, total_equity=equity,
            total_current_assets=bs_current_assets, total_assets=bs_total_assets,
            total_current_liabilities=bs_current_liab, total_liabilities=bs_total_liab,
        )

        # --- Cash Flow ---
        cash_flow = CashFlowStandard(
            operating_cash_flow=get("operating_cash_flow"),
            capex=get("capex"),
            investing_cash_flow=get("investing_cash_flow"),
            financing_cash_flow=get("financing_cash_flow"),
            net_change_in_cash=get("net_change_in_cash"),
        )

        # --- Operating Metrics ---
        # Zero/absent metric values are normalized to None (not reported).
        metrics = OperatingMetrics(
            industry="restaurant" if get("restaurant_margin") else "general",
            new_customers=int(get("new_customers")) if get("new_customers") else None,
            total_transactions=int(get("total_transactions")) if get("total_transactions") else None,
            total_seats=int(get("total_seats")) if get("total_seats") else None,
            churn_rate=get("churn_rate") if get("churn_rate") else None,
            cac=get("cac") if get("cac") else None,
            ltv=get("ltv") if get("ltv") else None,
        )

        # --- Metadata (all values stringified for transport) ---
        metadata = {
            "extraction_method": extraction_method,
            "extracted_restaurant_margin": str(get("restaurant_margin")),
            "extracted_effective_tax_rate": str(get("effective_tax_rate")),
        }

        if classification:
            metadata["document_type"] = classification.doc_type
            metadata["document_type_display"] = getattr(classification, 'doc_type_display', '')
            metadata["classification_confidence"] = str(classification.confidence)
            metadata["match_percentage"] = str(getattr(classification, 'match_percentage', 0.0))
            metadata["needs_review"] = str(getattr(classification, 'needs_review', False))
            metadata["matched_keywords"] = ",".join(getattr(classification, 'matched_keywords', [])[:20])
            metadata["extractable_fields"] = ",".join(getattr(classification, 'extractable_fields', []))
            metadata["detected_sections"] = ",".join(classification.detected_sections)
            metadata["secondary_types"] = ",".join(getattr(classification, 'secondary_types', []))

        # --- Company Name ---
        company_name = HybridPDFParser._resolve_company_name(
            dolphin_name=dolphin_company_name,
            text_content=text_content,
            file_path=file_path,
        )

        # --- Fiscal Year ---
        fiscal_year_date = HybridPDFParser._resolve_fiscal_year(
            dolphin_year=dolphin_fiscal_year,
            text_content=text_content,
        )

        return FinancialReport(
            company_name=company_name,
            period_end=fiscal_year_date,
            period_type=PeriodType.ANNUAL,
            currency=Currency.USD,
            income_statement=income,
            balance_sheet=balance,
            cash_flow=cash_flow,
            metrics=metrics,
            metadata=metadata,
        )

    # ==================================================================
    # Name & Date Resolution
    # ==================================================================

    @staticmethod
    def _resolve_company_name(
        dolphin_name: Optional[str],
        text_content: str,
        file_path: str,
    ) -> str:
        """Resolve company name: Dolphin → text heuristics → filename."""
        if dolphin_name:
            return dolphin_name

        # Replicate the core of PDFParser's (inline) name heuristics:
        # skip boilerplate headers, prefer the registrant line of SEC
        # filings, then fall back to the first meaningful line.
        lines = text_content.split("\n")
        ignored = {
            "TABLE OF CONTENTS", "CONTENTS", "INDEX", "FINANCIAL STATEMENTS",
            "CONSOLIDATED FINANCIAL STATEMENTS", "ANNUAL REPORT", "QUARTERLY REPORT",
            "10-K", "10-Q", "FORM 10-K", "FORM 10-Q", "UNITED STATES",
            "SECURITIES AND EXCHANGE COMMISSION", "WASHINGTON", "D.C.",
        }

        # SEC filing heuristic: company name appears just above the
        # "(exact name of registrant ...)" caption.
        registrant_idx = -1
        for i, line in enumerate(lines[:100]):
            if "exact name of registrant" in line.lower():
                registrant_idx = i
                break

        if registrant_idx > 0:
            for j in range(registrant_idx - 1, -1, -1):
                candidate = lines[j].strip()
                if len(candidate) > 2 and not any(ig in candidate.upper() for ig in ignored):
                    return candidate[:100]

        # First meaningful line
        for line in lines[:40]:
            candidate = line.strip()
            if (
                len(candidate) > 2
                and not any(ig in candidate.upper() for ig in ignored)
                and not candidate.isdigit()
                and any(c.isalpha() for c in candidate)
            ):
                return candidate[:100]

        # Filename fallback
        import os
        basename = os.path.basename(file_path)
        return os.path.splitext(basename)[0].replace("-", " ").replace("_", " ")

    @staticmethod
    def _resolve_fiscal_year(
        dolphin_year: Optional[str],
        text_content: str,
    ) -> date:
        """Resolve fiscal year: Dolphin → text patterns → today."""
        # Try Dolphin result first; accept any plausible 4-digit year and
        # normalize to a Dec-31 period end.
        if dolphin_year:
            year_match = re.search(r"\d{4}", dolphin_year)
            if year_match:
                y = int(year_match.group(0))
                if 1990 <= y <= date.today().year + 1:
                    return date(y, 12, 31)

        # Reuse PDFParser's fiscal year extraction
        from app.services.ingestion.parser_pdf import PDFParser
        return PDFParser._extract_fiscal_year(text_content)
app/services/ingestion/parser_pdf.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import re
3
+ from typing import Dict, Any, Optional, List
4
+ from app.schemas.financial import (
5
+ FinancialReport,
6
+ BalanceSheetStandard,
7
+ IncomeStatementStandard,
8
+ CashFlowStandard,
9
+ OperatingMetrics,
10
+ PeriodType,
11
+ Currency
12
+ )
13
+ from datetime import date
14
+ from app.services.ingestion.mappings import DataMapper
15
+
16
class PDFParser:
    """Standard PDF financial-statement parser (pdfplumber tables + regex).

    `parse` delegates to the hybrid Dolphin pipeline; the remaining
    static helpers implement the pdfplumber extraction primitives that
    HybridPDFParser reuses.
    """

    @staticmethod
    def parse(file_path: str) -> FinancialReport:
        """
        Delegates to HybridPDFParser to enable AI-enhanced extraction
        with automatic fallback to standard pdfplumber logic.
        """
        # Lazy import to avoid circular dependency.
        # NOTE: HybridPDFParser lives in app/services/ingestion/parser_dolphin.py,
        # not inside the dolphin/ subpackage.
        from app.services.ingestion.parser_dolphin import HybridPDFParser
        return HybridPDFParser.parse(file_path)

    @staticmethod
    def _finalize_report(name, income, balance, cash, metrics, meta, period_end):
        """Helper to construct the final FinancialReport object."""
        return FinancialReport(
            company_name=name,
            period_end=period_end,
            period_type=PeriodType.ANNUAL,
            currency=Currency.USD,
            income_statement=income,
            balance_sheet=balance,
            cash_flow=cash,
            metrics=metrics,
            metadata=meta
        )

    @staticmethod
    def _extract_fiscal_year(text: str) -> date:
        """Finds the fiscal year end date from the text.

        Searches the first 5000 chars for "Year/Period Ended <Month> <d>, <yyyy>"
        or "December 31, <yyyy>" and returns Dec 31 of the latest plausible
        year found; falls back to today's date.
        """
        patterns = [
            r"(?:YEAR|PERIOD|FISCAL YEAR)\s+ENDED\s+([A-Z]+\s+\d{1,2},\s+\d{4})",
            r"DECEMBER\s+31,\s+(\d{4})"
        ]

        current_year = date.today().year
        found_years = []

        for pat in patterns:
            matches = re.findall(pat, text[:5000], re.IGNORECASE)  # Header area only
            for m in matches:
                if isinstance(m, tuple): m = m[0]
                # Extract the 4-digit year from the matched date string
                year_match = re.search(r"\d{4}", m)
                if year_match:
                    y = int(year_match.group(0))
                    if 1990 <= y <= current_year + 1:
                        found_years.append(y)

        if found_years:
            # The max year in the header is normally the current report year.
            best_year = max(found_years)
            return date(best_year, 12, 31)  # Default to Dec 31

        return date.today()

    @staticmethod
    def _find_statement_pages(pdf) -> Dict[str, Any]:
        """Identifies pages containing specific financial statements.

        Returns a dict mapping "income"/"balance"/"cash_flow" to the first
        page whose text matches that statement's title keywords.
        """
        pages = {}
        for page in pdf.pages:
            text = (page.extract_text() or "").upper()

            # Skip Table of Contents pages (unless they contain financial data like '$')
            if ("TABLE OF CONTENTS" in text[:500] or "INDEX" in text[:200]) and "$" not in text[:2000]:
                continue

            # Income statement titles
            # (fixed: "DISSOLIDATED" typo -> "CONSOLIDATED STATEMENTS OF LOSS")
            if any(x in text for x in ["CONSOLIDATED STATEMENTS OF OPERATIONS", "CONSOLIDATED STATEMENTS OF INCOME", "CONSOLIDATED STATEMENTS OF EARNINGS", "CONSOLIDATED STATEMENTS OF LOSS", "STATEMENT OF INCOME", "STATEMENTS OF OPERATIONS"]):
                if "income" not in pages: pages["income"] = page

            # Balance sheet titles
            elif any(x in text for x in ["CONSOLIDATED BALANCE SHEETS", "CONSOLIDATED STATEMENTS OF FINANCIAL POSITION", "BALANCE SHEETS", "FINANCIAL POSITION"]):
                if "balance" not in pages: pages["balance"] = page

            # Cash flow statement titles
            elif any(x in text for x in ["CONSOLIDATED STATEMENTS OF CASH FLOWS", "CONSOLIDATED STATEMENT OF CASH FLOWS", "STATEMENTS OF CASH FLOWS", "CASH FLOWS"]):
                if "cash_flow" not in pages: pages["cash_flow"] = page

        return pages

    @staticmethod
    def _extract_table_data(page, allowed_fields: Optional[List[str]] = None) -> Dict[str, float]:
        """Extracts key-value pairs from tables on the page with smart column selection.

        Picks the column headed by the most recent year, detects an
        "(in thousands)"/"(in millions)" scale note, and maps row labels
        through DataMapper (optionally restricted to `allowed_fields`).
        """
        data = {}
        tables = page.extract_tables()

        for table in tables:
            # 1. Identify "Current Year" column: scan first 5 rows for
            # 4-digit years (e.g. 2024, 2023) and keep the max-year column.
            target_col_idx = -1
            max_year = 0

            headers = table[:5]
            for row in headers:
                for idx, cell in enumerate(row):
                    if not cell: continue
                    cleaned = cell.replace("$", "").strip()
                    if re.match(r"^\d{4}$", cleaned):
                        y = int(cleaned)
                        if 2000 < y < 2100:
                            if y > max_year:
                                max_year = y
                                target_col_idx = idx

            # If no year found, fall back to first numeric column below.

            # 2. Header-based scaling detection: "(in thousands)",
            # "(in millions)", etc. in the top-of-page text.
            multiplier = 1.0
            header_text = (page.extract_text() or "")[:1000].lower()

            if re.search(r"\(in millions\)|in millions, except|dollares en millones|amounts in millions|dollars in millions", header_text):
                multiplier = 1000000.0
            elif re.search(r"\(in thousands\)|in thousands, except|dollares en miles|amounts in thousands|dollars in thousands|\(in 000s\)", header_text):
                multiplier = 1000.0

            for row in table:
                if not row or not row[0]: continue

                label = row[0]
                mapped_field = DataMapper.map_row(label)

                if mapped_field:
                    if allowed_fields is not None and mapped_field not in allowed_fields:
                        continue

                    # Extract value from the trusted year column, else the
                    # first parsable numeric column.
                    val = None
                    if target_col_idx != -1 and target_col_idx < len(row):
                        val = PDFParser._clean_value(row[target_col_idx])
                    else:
                        for col_val in row[1:]:
                            clean_val = PDFParser._clean_value(col_val)
                            if clean_val is not None:
                                val = clean_val
                                break

                    if val is not None:
                        data[mapped_field] = val * multiplier
        return data

    @staticmethod
    def _clean_value(val_str: Optional[str]) -> Optional[float]:
        """Converts financial string formats to float. Handles parentheses for negative."""
        if not val_str:
            return None

        s = val_str.strip().replace("$", "").replace(",", "").replace(" ", "")
        if not s:
            return None

        # Handle (123) as negative
        if "(" in s and ")" in s:
            s = s.replace("(", "-").replace(")", "")

        # Handle - as 0 (accounting format sometimes uses - for 0)
        if s == "-" or s == "—":
            return 0.0

        try:
            return float(s)
        except ValueError:
            return None

    @staticmethod
    def _extract_via_regex(text_content: str, existing_keys: List[str]) -> Dict[str, float]:
        """Fallback extraction for items not found in tables.

        For each unmapped field, scans the text for any alias followed by
        a number ("Keyword $1,234.56" or "Keyword....... 1,234.56").
        """
        data = {}
        for field, aliases in DataMapper.FIELD_MAPPING.items():
            if field in existing_keys:
                continue

            for k in aliases:
                # re.escape the alias: labels may contain regex
                # metacharacters (e.g. "P&L (loss)") which would otherwise
                # break or distort the pattern.
                pattern = re.compile(rf"{re.escape(k)}[^0-9-]*?(\(?[\d,]+\.?\d*\)?)", re.IGNORECASE)
                match = pattern.search(text_content)
                if match:
                    val = PDFParser._clean_value(match.group(1))
                    if val is not None:
                        data[field] = val
                        break
        return data
app/services/ingestion/parser_xlsx.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ XLSX Parser - Excel file parsing for financial data.
3
+
4
+ Parses Excel workbooks containing financial statements, handling:
5
+ - Multi-sheet detection (Income Statement, Balance Sheet, Cash Flow)
6
+ - Single-sheet condensed format
7
+ - Various column/row layouts
8
+ """
9
+
10
+ import re
11
+ from typing import Dict, Any, Optional, List
12
+ from datetime import date
13
+
14
+ try:
15
+ import openpyxl
16
+ from openpyxl import load_workbook
17
+ from openpyxl.worksheet.worksheet import Worksheet
18
+ except ImportError:
19
+ openpyxl = None
20
+
21
+ import pandas as pd
22
+
23
+ from app.schemas.financial import (
24
+ FinancialReport,
25
+ BalanceSheetStandard,
26
+ IncomeStatementStandard,
27
+ CashFlowStandard,
28
+ OperatingMetrics,
29
+ PeriodType,
30
+ Currency
31
+ )
32
+ from app.services.ingestion.mappings import DataMapper
33
+
34
+
35
+ class XLSXParser:
36
+ """Parser for Excel (.xlsx, .xls) financial files."""
37
+
38
+ # Keywords to identify sheet types
39
+ INCOME_KEYWORDS = ['income', 'p&l', 'profit', 'loss', 'revenue', 'earnings']
40
+ BALANCE_KEYWORDS = ['balance', 'assets', 'liabilities', 'position']
41
+ CASHFLOW_KEYWORDS = ['cash flow', 'cashflow', 'cash', 'liquidity']
42
+
43
+ @staticmethod
44
+ def parse(file_path: str) -> FinancialReport:
45
+ """
46
+ Parse an Excel file and return a standardized FinancialReport.
47
+
48
+ Handles both multi-sheet and single-sheet formats.
49
+ """
50
+ if openpyxl is None:
51
+ # Fallback to pandas-only parsing
52
+ return XLSXParser._parse_with_pandas(file_path)
53
+
54
+ try:
55
+ wb = load_workbook(file_path, data_only=True)
56
+
57
+ # Categorize sheets
58
+ income_sheet = None
59
+ balance_sheet = None
60
+ cashflow_sheet = None
61
+
62
+ for sheet_name in wb.sheetnames:
63
+ name_lower = sheet_name.lower()
64
+
65
+ if any(kw in name_lower for kw in XLSXParser.INCOME_KEYWORDS):
66
+ income_sheet = wb[sheet_name]
67
+ elif any(kw in name_lower for kw in XLSXParser.BALANCE_KEYWORDS):
68
+ balance_sheet = wb[sheet_name]
69
+ elif any(kw in name_lower for kw in XLSXParser.CASHFLOW_KEYWORDS):
70
+ cashflow_sheet = wb[sheet_name]
71
+
72
+ # If no specialized sheets found, use first sheet for all
73
+ if not income_sheet and not balance_sheet and not cashflow_sheet:
74
+ default_sheet = wb.active
75
+ income_sheet = balance_sheet = cashflow_sheet = default_sheet
76
+
77
+ # Extract data from each sheet
78
+ data_dict = {}
79
+
80
+ if income_sheet:
81
+ data_dict.update(XLSXParser._extract_from_sheet(income_sheet))
82
+ if balance_sheet and balance_sheet != income_sheet:
83
+ data_dict.update(XLSXParser._extract_from_sheet(balance_sheet))
84
+ if cashflow_sheet and cashflow_sheet != income_sheet and cashflow_sheet != balance_sheet:
85
+ data_dict.update(XLSXParser._extract_from_sheet(cashflow_sheet))
86
+
87
+ # If still no data, try pandas fallback
88
+ if not data_dict:
89
+ return XLSXParser._parse_with_pandas(file_path)
90
+
91
+ # Extract company name from filename or first cell
92
+ company_name = XLSXParser._extract_company_name(wb)
93
+
94
+ return XLSXParser._build_report(data_dict, company_name)
95
+
96
+ except Exception as e:
97
+ # Fallback to pandas
98
+ print(f"openpyxl parse failed, falling back to pandas: {e}")
99
+ return XLSXParser._parse_with_pandas(file_path)
100
+
101
+ @staticmethod
102
+ def _extract_from_sheet(sheet: 'Worksheet') -> Dict[str, float]:
103
+ """Extract financial data from a worksheet."""
104
+ data = {}
105
+
106
+ # Try to find the data range
107
+ # Look for rows with label in first column and numeric value in subsequent columns
108
+ for row in sheet.iter_rows(min_row=1, max_row=min(200, sheet.max_row)):
109
+ if not row or not row[0].value:
110
+ continue
111
+
112
+ label = str(row[0].value).strip()
113
+ field = DataMapper.map_row(label)
114
+
115
+ if field:
116
+ # Find the first non-empty numeric value in this row
117
+ for cell in row[1:]:
118
+ if cell.value is not None:
119
+ try:
120
+ val = XLSXParser._clean_value(cell.value)
121
+ if val is not None:
122
+ data[field] = val
123
+ break
124
+ except:
125
+ continue
126
+
127
+ return data
128
+
129
+ @staticmethod
130
+ def _clean_value(val: Any) -> Optional[float]:
131
+ """Clean and convert a cell value to float."""
132
+ if val is None:
133
+ return None
134
+ if isinstance(val, (int, float)):
135
+ return float(val)
136
+ if isinstance(val, str):
137
+ # Remove currency symbols, commas, parentheses for negatives
138
+ cleaned = re.sub(r'[,$]', '', val.strip())
139
+ # Handle (1000) format for negatives
140
+ if cleaned.startswith('(') and cleaned.endswith(')'):
141
+ cleaned = '-' + cleaned[1:-1]
142
+ try:
143
+ return float(cleaned)
144
+ except ValueError:
145
+ return None
146
+ return None
147
+
148
+ @staticmethod
149
+ def _extract_company_name(wb) -> str:
150
+ """Try to extract company name from workbook."""
151
+ # Check first sheet, first few cells
152
+ sheet = wb.active
153
+ for row in sheet.iter_rows(min_row=1, max_row=5, max_col=3):
154
+ for cell in row:
155
+ if cell.value and isinstance(cell.value, str):
156
+ val = cell.value.strip()
157
+ # Skip common headers
158
+ if len(val) > 3 and len(val) < 100:
159
+ lower = val.lower()
160
+ if not any(kw in lower for kw in ['balance', 'income', 'cash', 'statement', 'period', 'date', 'quarter', 'annual']):
161
+ return val
162
+ return "Imported Company"
163
+
164
+ @staticmethod
165
+ def _parse_with_pandas(file_path: str) -> FinancialReport:
166
+ """Fallback parsing using pandas."""
167
+ try:
168
+ # Read all sheets
169
+ xl = pd.ExcelFile(file_path)
170
+ data_dict = {}
171
+
172
+ for sheet_name in xl.sheet_names:
173
+ df = pd.read_excel(xl, sheet_name=sheet_name)
174
+
175
+ if df.empty:
176
+ continue
177
+
178
+ # Try vertical format (label in col 0, value in col 1+)
179
+ if len(df.columns) >= 2:
180
+ for _, row in df.iterrows():
181
+ label = str(row.iloc[0]) if pd.notna(row.iloc[0]) else ""
182
+ field = DataMapper.map_row(label)
183
+ if field:
184
+ # Find first numeric value
185
+ for val in row.iloc[1:]:
186
+ if pd.notna(val):
187
+ try:
188
+ data_dict[field] = float(str(val).replace(',', '').replace('$', ''))
189
+ break
190
+ except:
191
+ continue
192
+
193
+ return XLSXParser._build_report(data_dict, "Imported Company")
194
+
195
+ except Exception as e:
196
+ print(f"Pandas XLSX parse failed: {e}")
197
+ return XLSXParser._build_empty_report()
198
+
199
+ @staticmethod
200
+ def _build_report(data_dict: Dict[str, float], company_name: str) -> FinancialReport:
201
+ """Build FinancialReport from extracted data."""
202
+ def get(key: str, default: float = 0.0) -> float:
203
+ return data_dict.get(key, default)
204
+
205
+ # Computed Income
206
+ revenue = get("revenue")
207
+ cogs = get("cogs")
208
+ marketing = get("marketing_expenses")
209
+ payroll = get("payroll_expenses")
210
+ rent = get("rent_expense")
211
+ other = get("other_operating_expenses")
212
+ depreciation = get("depreciation")
213
+ amortization = get("amortization")
214
+ interest = get("interest_expense")
215
+ taxes = get("taxes")
216
+
217
+ op_expenses = marketing + payroll + rent + other
218
+ gross_profit = revenue - cogs
219
+ ebitda = gross_profit - op_expenses
220
+ op_income = ebitda - depreciation - amortization
221
+ net_income = op_income - interest - taxes
222
+
223
+ income = IncomeStatementStandard(
224
+ revenue=revenue,
225
+ cogs=cogs,
226
+ marketing_expenses=marketing,
227
+ payroll_expenses=payroll,
228
+ rent_expense=rent,
229
+ other_operating_expenses=other,
230
+ depreciation=depreciation,
231
+ amortization=amortization,
232
+ interest_expense=interest,
233
+ taxes=taxes,
234
+ # Computed
235
+ operating_expenses=op_expenses,
236
+ gross_profit=gross_profit,
237
+ ebitda=ebitda,
238
+ operating_income=op_income,
239
+ net_income=net_income
240
+ )
241
+
242
+ # Computed Balance
243
+ cash = get("cash")
244
+ ar = get("accounts_receivable")
245
+ inv = get("inventory")
246
+ prepaid = get("prepaid_expenses")
247
+ ppe = get("property_plant_equipment")
248
+ accum_dep = get("accumulated_depreciation")
249
+ intangibles = get("intangible_assets")
250
+
251
+ ap = get("accounts_payable")
252
+ accrued = get("accrued_liabilities")
253
+ st_debt = get("short_term_debt")
254
+ lt_debt = get("long_term_debt")
255
+ deferred = get("deferred_revenue")
256
+ equity = get("total_equity")
257
+
258
+ bs_current_assets = cash + ar + inv + prepaid
259
+ bs_total_assets = bs_current_assets + (ppe - accum_dep) + intangibles
260
+ bs_current_liab = ap + accrued + st_debt
261
+ bs_total_liab = bs_current_liab + lt_debt + deferred
262
+
263
+ balance = BalanceSheetStandard(
264
+ cash=cash,
265
+ accounts_receivable=ar,
266
+ inventory=inv,
267
+ prepaid_expenses=prepaid,
268
+ property_plant_equipment=ppe,
269
+ accumulated_depreciation=accum_dep,
270
+ intangible_assets=intangibles,
271
+ accounts_payable=ap,
272
+ accrued_liabilities=accrued,
273
+ short_term_debt=st_debt,
274
+ long_term_debt=lt_debt,
275
+ deferred_revenue=deferred,
276
+ total_equity=equity,
277
+ # Computed
278
+ total_current_assets=bs_current_assets,
279
+ total_assets=bs_total_assets,
280
+ total_current_liabilities=bs_current_liab,
281
+ total_liabilities=bs_total_liab
282
+ )
283
+
284
+ cash_flow = CashFlowStandard(
285
+ operating_cash_flow=get("operating_cash_flow"),
286
+ capex=get("capex"),
287
+ investing_cash_flow=get("investing_cash_flow"),
288
+ financing_cash_flow=get("financing_cash_flow")
289
+ )
290
+
291
+ metrics = OperatingMetrics(
292
+ industry='general',
293
+ new_customers=int(get("new_customers")) if get("new_customers") else None,
294
+ total_transactions=int(get("total_transactions")) if get("total_transactions") else None,
295
+ total_seats=int(get("total_seats")) if get("total_seats") else None
296
+ )
297
+
298
+ return FinancialReport(
299
+ company_name=company_name,
300
+ period_end=date.today(),
301
+ period_type=PeriodType.ANNUAL,
302
+ currency=Currency.USD,
303
+ income_statement=income,
304
+ balance_sheet=balance,
305
+ cash_flow=cash_flow,
306
+ metrics=metrics
307
+ )
308
+
309
+ @staticmethod
310
+ def _build_empty_report() -> FinancialReport:
311
+ """Build an empty report as last resort."""
312
+ return XLSXParser._build_report({}, "Unknown Company")
app/services/ingestion/unified_parser.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unified Parser - Central coordinator for all file format parsing.
3
+
4
+ This module provides a single entry point for parsing any supported
5
+ financial document format (CSV, PDF, XLSX).
6
+ """
7
+
8
+ from typing import Tuple
9
+ from app.schemas.financial import FinancialReport
10
+
11
+
12
class UnifiedParser:
    """
    Central parser that routes files to appropriate format-specific parsers.

    Supported formats:
    - CSV: Comma-separated values
    - PDF: PDF documents (10-K, 10-Q, financial reports)
    - XLSX/XLS: Excel workbooks
    """

    # Maps a lowercase file extension to the parser family that handles it.
    SUPPORTED_EXTENSIONS = {
        'csv': 'csv',
        'pdf': 'pdf',
        'xlsx': 'xlsx',
        'xls': 'xlsx',  # Route both to XLSX parser
    }

    @staticmethod
    def get_format(filename: str) -> str:
        """
        Determine file format from filename.

        Returns: 'csv', 'pdf', or 'xlsx', or raises ValueError
        """
        extension = filename.lower().rsplit('.', 1)[-1] if '.' in filename else ''
        try:
            return UnifiedParser.SUPPORTED_EXTENSIONS[extension]
        except KeyError:
            raise ValueError(f"Unsupported file format: .{extension}. Supported: .csv, .pdf, .xlsx, .xls") from None

    @staticmethod
    def parse(file_path: str, filename: str) -> 'FinancialReport':
        """
        Parse a financial document and return standardized FinancialReport.

        Args:
            file_path: Path to the saved file on disk
            filename: Original filename (used for format detection)

        Returns:
            FinancialReport with standardized financial data

        Raises:
            ValueError: If file format is not supported
        """
        detected = UnifiedParser.get_format(filename)

        # Parsers are imported lazily so unused format backends (and their
        # heavy dependencies) are never loaded.
        if detected == 'csv':
            from app.services.ingestion.parser_csv import CSVParser
            return CSVParser.parse(file_path)

        if detected == 'pdf':
            from app.services.ingestion.parser_dolphin import HybridPDFParser
            return HybridPDFParser.parse(file_path)

        if detected == 'xlsx':
            from app.services.ingestion.parser_xlsx import XLSXParser
            return XLSXParser.parse(file_path)

        raise ValueError(f"No parser available for format: {detected}")

    @staticmethod
    def is_supported(filename: str) -> bool:
        """Check if a filename has a supported extension."""
        if '.' not in filename:
            return False
        return filename.lower().rsplit('.', 1)[-1] in UnifiedParser.SUPPORTED_EXTENSIONS

    @staticmethod
    def get_supported_extensions() -> list:
        """Return list of supported file extensions."""
        return [ext for ext in UnifiedParser.SUPPORTED_EXTENSIONS]
app/services/intelligence/ai_cfo.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.financial import StandardizedDataPackage
2
+ import os
3
+
4
class AICFOService:
    @staticmethod
    def generate_executive_summary(data: 'StandardizedDataPackage') -> str:
        """
        Generates a natural language executive summary using a generative AI model.
        Currently scaffolds the prompt construction and mocks the response if no API key is present.
        """

        # 1. Construct Context
        company = data.raw_data.company_name
        revenue = data.raw_data.income_statement.revenue
        margin = data.kpis.net_margin
        score = data.risk_analysis.risk_score
        pain_points = ', '.join(p for p in data.insights if 'Pain' in p)

        # Prompt is scaffolding for the (not yet wired) Gemini call below;
        # intentionally unused until that integration lands.
        prompt = f"""
        You are an elite CFO advising the CEO of {company}.
        Financial Snapshot:
        - Annual Revenue: ${revenue:,.2f}
        - Net Margin: {margin:.1f}%
        - Overall Risk Score: {score}/100
        - Top Pain Points: {pain_points}

        Write a 3-paragraph executive summary:
        1. The Good: What is working well?
        2. The Bad: What are the immediate risks?
        3. The Ugly: What needs drastic change immediately?

        Keep it punchy, professional, and actionable.
        """

        # 2. Call LLM (Placeholder for Gemini)
        # api_key = os.getenv("GEMINI_API_KEY")
        # if api_key:
        #     return call_gemini(api_key, prompt)

        # 3. Mock Response (Fallback)
        sections = [
            f"## Executive Summary for {company}\n\n",
            "**The Good:**\n",
            f"Your revenue is strong at ${revenue:,.0f}, demonstrating clear market demand. ",
            f"A net margin of {margin:.1f}% is respectable, indicating your core unit economics are sound. ",
            f"With a Health Score of {data.health_score.total_score}/100, the business foundation is stable.\n\n",
            "**The Bad:**\n",
            "We detected some potential liquidity friction locally. Your burn rate suggests you might have constrained runway if sales dip. ",
            "Optimization of COGS could yield an immediate 2-3% bottom-line improvement.\n\n",
            "**The Ugly:**\n",
            "No catastrophic risks detected immediately, but reliance on a single revenue stream could be a blind spot. ",
            "I recommend diversifying customer acquisition channels immediately to safeguard against volatility.",
        ]
        return "".join(sections)
app/services/intelligence/gemini_service.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import requests
4
+ import json
5
+ from dotenv import load_dotenv
6
+ from app.schemas.chat import ChatRequest, ChatResponse
7
+ from app.schemas.financial import StandardizedDataPackage
8
+
9
+ # Load .env file
10
+ load_dotenv()
11
+
12
class GeminiService:
    """Thin wrapper around the Google Generative Language REST API.

    Provides chat-style answers grounded in a financial context, a generic
    text generator for internal services, and an automatic fallback chain
    across several Gemini models when one hits its quota. All failure paths
    return clean user-presentable text — never raw JSON or tracebacks.
    """

    # Read once at import time; a missing key degrades to fallback content.
    API_KEY = os.getenv("GEMINI_API_KEY")

    # Model fallback chain - try in order, fall back if quota exceeded
    MODELS = [
        "gemini-3-flash",         # Primary - fastest, newest
        "gemini-2.5-flash",       # Fallback 1 - stable
        "gemini-2.5-flash-lite",  # Fallback 2 - lightweight
        "gemini-2.0-flash",       # Fallback 3 - legacy stable
    ]

    # Models that returned HTTP 429 in this process; skipped until reset.
    _exhausted_models = set()

    @classmethod
    def _get_api_url(cls, model_name: str) -> str:
        """Generate API URL for a specific model."""
        return f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent?key={cls.API_KEY}"

    @classmethod
    def _reset_exhausted_models(cls):
        """Reset exhausted models (call periodically or on new day)."""
        cls._exhausted_models.clear()

    @staticmethod
    def _parse_error_response(status_code: int, response_text: str) -> str:
        """
        Map an HTTP error status to a clean, user-friendly message.
        Never expose raw JSON to users. ``response_text`` is accepted for
        future use (e.g. logging) but is intentionally never surfaced.
        """
        if status_code == 429:
            return "AI service is temporarily busy. Please try again in a few moments."
        elif status_code in (401, 403):
            return "AI service authentication failed. Please check your API key configuration."
        elif status_code == 400:
            return "Invalid request to AI service. Please try a simpler query."
        elif status_code == 500:
            return "AI service is experiencing issues. Please try again later."
        elif status_code == 503:
            return "AI service is temporarily unavailable. Please try again later."
        else:
            return f"AI service returned an unexpected error (Code: {status_code}). Please try again."

    @classmethod
    def _try_request(cls, payload: dict, timeout: int = 30) -> tuple[bool, str, str]:
        """
        Try to make a request using available models with automatic fallback.

        Returns: (success: bool, response_text: str, model_used: str).
        On failure, ``response_text`` is the last user-presentable error.
        """
        if not cls.API_KEY:
            return False, "Gemini API Key is missing. Please configure GEMINI_API_KEY.", ""

        headers = {"Content-Type": "application/json"}
        last_error = ""

        for model in cls.MODELS:
            # Skip models that have hit their quota this session
            if model in cls._exhausted_models:
                continue

            try:
                api_url = cls._get_api_url(model)
                response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

                if response.status_code == 200:
                    result = response.json()
                    try:
                        text = result['candidates'][0]['content']['parts'][0]['text']
                        return True, text, model
                    except (KeyError, IndexError):
                        # 200 with no candidates/parts: treat as empty output.
                        last_error = "AI generated empty response."
                        continue

                elif response.status_code == 429:
                    # Model quota exceeded - mark as exhausted and try next
                    cls._exhausted_models.add(model)
                    print(f"Model {model} quota exceeded, trying next model...")
                    last_error = "All AI models are currently at capacity."
                    continue

                else:
                    # Other error - try next model
                    last_error = cls._parse_error_response(response.status_code, response.text)
                    continue

            except requests.exceptions.Timeout:
                last_error = "AI service timed out."
                continue
            except requests.exceptions.ConnectionError:
                last_error = "Unable to connect to AI service."
                continue
            except Exception:
                last_error = "An unexpected error occurred."
                continue

        # All models exhausted
        return False, last_error, ""

    @classmethod
    def query(cls, request: 'ChatRequest', context_data: 'StandardizedDataPackage') -> 'ChatResponse':
        """Answer a user chat message with the financial context injected into the prompt."""
        if not cls.API_KEY:
            return ChatResponse(response="Gemini API Key is missing. Please configure GEMINI_API_KEY in the backend.")

        # Construct Prompt with Financial Context
        system_prompt = f"""
        You are Visique, an expert AI CFO. You are analyzing the financial data for {context_data.raw_data.company_name}.

        Financial Context:
        - Revenue: {context_data.raw_data.income_statement.revenue} {context_data.raw_data.currency}
        - Net Income: {context_data.raw_data.income_statement.net_income}
        - Cash Balance: {context_data.raw_data.balance_sheet.cash}
        - Health Score: {context_data.health_score.total_score}/100

        Key Insights:
        {json.dumps(context_data.insights, indent=2)}

        Optimization Insights (Heatmap/Dead Zones):
        {json.dumps([z for z in context_data.optimization_insights.dead_zones] if context_data.optimization_insights else [], indent=2)}

        User Question: {request.message}

        Answer concisely as a CFO. If the user asks about "Dynamic Promos" or "Optimization", refer to the Dead Zones data.
        """

        payload = {
            "contents": [{
                "parts": [{"text": system_prompt}]
            }]
        }

        # _try_request returns user-presentable text on both success and
        # failure, so the result is returned either way (the original had an
        # if/else whose two branches were identical).
        success, response_text, _model_used = cls._try_request(payload)
        return ChatResponse(response=response_text)

    @classmethod
    def generate_content(cls, prompt: str) -> str:
        """
        Generic generator for internal services (like GeoService).
        Uses automatic model fallback. Returns clean, presentable text.
        """
        if not cls.API_KEY:
            return "Strategic insights require AI configuration. Contact support for assistance."

        payload = {
            "contents": [{
                "parts": [{"text": prompt}]
            }]
        }

        success, response_text, _model_used = cls._try_request(payload)

        if success:
            return response_text
        # Return intelligent fallback content instead of error
        return cls._get_fallback_content(prompt)

    @staticmethod
    def _get_fallback_content(prompt: str) -> str:
        """
        Provide meaningful fallback content when ALL AI models are unavailable.
        This ensures reports and displays never show error messages.
        The branch is chosen by keyword-sniffing the prompt.
        """
        prompt_lower = prompt.lower()

        if "competitor" in prompt_lower or "landscape" in prompt_lower:
            return """**Market Analysis**

Based on industry standards for your sector:

• **Primary Competition**: Focus on businesses within a 5-mile radius offering similar services
• **Traffic Patterns**: Peak hours typically align with lunch (11am-2pm) and evening (5pm-8pm) periods
• **Differentiation**: Evaluate unique value propositions against local alternatives

*AI-powered real-time analysis available when capacity permits.*"""

        elif "strategic" in prompt_lower or "context" in prompt_lower:
            return """**Strategic Context Overview**

Key considerations for your market:

• **Regulatory Environment**: Stay current with local business regulations and licensing requirements
• **Economic Indicators**: Monitor regional employment and consumer spending trends
• **Industry Outlook**: Your sector shows stable fundamentals with growth potential

*Enhanced AI insights will be available shortly.*"""

        elif "marketing" in prompt_lower or "growth" in prompt_lower:
            return """**Growth Strategy Framework**

Recommended focus areas for sustainable growth:

• **Digital Presence**: Optimize Google Business Profile and local SEO
• **Customer Retention**: Implement loyalty programs to increase lifetime value
• **Community Engagement**: Partner with local organizations for visibility

*AI-powered personalized recommendations available when capacity permits.*"""

        else:
            return """**Analysis Summary**

Your financial data has been processed successfully. Key takeaways:

• Review the health score breakdown for areas of strength and improvement
• Monitor cash runway projections for operational planning
• Consider the recommendations provided for optimization opportunities

*For deeper AI-driven insights, please try again in a few minutes.*"""

    @classmethod
    def get_model_status(cls) -> dict:
        """
        Get current status of available models (for debugging/admin).
        """
        available_models = [m for m in cls.MODELS if m not in cls._exhausted_models]
        exhausted = list(cls._exhausted_models)

        return {
            "total_models": len(cls.MODELS),
            "available_models": available_models,
            "exhausted_models": exhausted,
            "all_exhausted": len(available_models) == 0
        }
+
app/services/intelligence/geo_service.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import random
3
+
4
class GeoService:
    """Location intelligence: competitor, regulatory, and marketing analysis.

    Uses Gemini when an API key is configured; otherwise returns simulated
    mock content (with randomized metrics) so downstream reports still render.
    """

    @staticmethod
    def analyze_location(address: str, industry: str = "General", is_own_company: bool = False, company_name: str = ""):
        """
        Generates strategic analysis using Google Gemini if available,
        otherwise falls back to simulation.

        :param address: The address to analyze
        :param industry: The industry type
        :param is_own_company: Whether this is the user's own company (enables more personalized insights)
        :param company_name: Name of the company being analyzed
        :return: dict with keys 'competitor_analysis', 'strategic_context',
            'marketing_strategy' (each a markdown string)
        """
        # Imported lazily, presumably to avoid a circular import between the
        # intelligence services at module load time — confirm.
        from app.services.intelligence.gemini_service import GeminiService

        context_prefix = f"for {company_name}" if company_name else ""
        personalization = "your business" if is_own_company else f"this {industry} business"

        # Check for Real AI Capability
        if GeminiService.API_KEY:
            try:
                # 1. Competitor Landscape
                p1 = f"Analyze the competitor landscape {context_prefix} for a {industry} business located at {address}. {'As the owner, provide actionable competitive intelligence.' if is_own_company else 'Provide general market context.'} Identify 3 competitors and describe the traffic patterns in the area. Limit to 150 words. Format with **Bold** headers."
                comp_summary = GeminiService.generate_content(p1)

                # 2. Strategic Context
                p2 = f"Provide a brief strategic context analysis for {address} regarding local regulations, news events, and economic sentiment for the {industry} sector {context_prefix}. {'Include specific recommendations for the owner.' if is_own_company else ''} Limit to 150 words."
                context_summary = GeminiService.generate_content(p2)

                # 3. Marketing Strategy
                p3 = f"Suggest a growth and marketing strategy for {personalization} at {address}. {'Be specific with actionable next steps for the owner to implement.' if is_own_company else 'Provide general market positioning advice.'} Include digital positioning advice and 2 actionable recommendations. Limit to 150 words."
                marketing_summary = GeminiService.generate_content(p3)

                return {
                    "competitor_analysis": comp_summary,
                    "strategic_context": context_summary,
                    "marketing_strategy": marketing_summary
                }
            except Exception as e:
                # generate_content already returns fallback text on API errors,
                # so this mainly guards against unexpected local failures.
                print(f"Gemini Generation Failed: {e}. Falling back to simulation.")
                # Fallthrough to default logic below

        # ... FALLBACK MOCK DATA ...
        # Mocking external data capabilities. NOTE: scores below come from the
        # module-level `random` import, so the mock output is nondeterministic.
        competitors = [
            "Alpha Competitor Inc.", "Beta Rivals LLC", "Local Market Leader"
        ] if industry != "Restaurant" else [
            "The Hungry Chef", "Burger King", "Downtown Bistro"
        ]

        ownership_note = "As the owner of this business," if is_own_company else "For this business,"
        company_ref = company_name if company_name else "the business"

        # 1. Competitor & Location Analysis (Page 1 content)
        comp_summary = f"""
**Location Analysis for:** {address}
**Company:** {company_ref}
**Industry Focus:** {industry}

**Competitor Landscape:**
{ownership_note} we have identified {len(competitors)} primary competitors within a 5-mile radius:
{', '.join(competitors)}.

**Traffic Patterns:**
Based on historical data, the highest foot traffic in your area occurs between 11:00 AM and 2:00 PM on weekdays.

**Site Accessibility:**
Your location has a Walk Score of {random.randint(40, 95)}/100 and Transit Score of {random.randint(30, 80)}/100.
"""

        # 2. Political & Local News Context (Page 2 content)
        context_summary = f"""
**Strategic Context: Local & Political Landscape**

**Regulatory Updates:**
Recent city council proceedings indicate a favorable shift for {industry} businesses.

**Economic Sentiment:**
Local consumer sentiment is currently 'Optimistic' with a spending index of {random.randint(90, 110)}.

{"**Owner Action Item:** Engage with local business association for networking opportunities." if is_own_company else ""}
"""

        # 3. Marketing & Growth Opportunities (Page 3 content)
        marketing_summary = f"""
**Growth & Marketing Strategy for {company_ref}**

**Key Marketing Events:**
Leverage upcoming local opportunities like the Annual City Festival.

**Actionable Recommendations:**
1. **Hyper-Local SEO:** {"Optimize your" if is_own_company else "Optimize the"} Google Business Profile for '{company_ref}'.
2. **Community Partnerships:** Engage with local news events and neighborhood associations.
{"3. **Owner Priority:** Focus on building customer reviews - aim for 50+ 5-star reviews." if is_own_company else ""}
"""

        return {
            "competitor_analysis": comp_summary,
            "strategic_context": context_summary,
            "marketing_strategy": marketing_summary
        }
+