Happy People committed on
Commit
11d88a8
·
1 Parent(s): 30ae158

Deploying CPU-optimized Dolphin Worker from Visique

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Dockerfile +27 -13
  2. app/__init__.py +98 -0
  3. app/api/admin.py +606 -0
  4. app/api/auth.py +273 -0
  5. app/api/endpoints.py +830 -0
  6. app/api/visilok.py +67 -0
  7. app/core/__init__.py +52 -0
  8. app/core/config.py +71 -0
  9. app/core/database.py +46 -0
  10. app/core/feature_registry.py +266 -0
  11. app/core/migrations.py +111 -0
  12. app/core/plan_config.py +192 -0
  13. app/core/security.py +28 -0
  14. app/core/stripe_config.py +29 -0
  15. app/main.py +127 -0
  16. app/models/feature_flags.py +59 -0
  17. app/models/user.py +63 -0
  18. app/schemas/chat.py +14 -0
  19. app/schemas/financial.py +47 -0
  20. app/schemas/user.py +82 -0
  21. app/services/__init__.py +36 -0
  22. app/services/analysis/__init__.py +54 -0
  23. app/services/analysis/engine_lite.py +48 -0
  24. app/services/analysis/factory.py +18 -0
  25. app/services/analysis/fundamental.py +75 -0
  26. app/services/analysis/growth.py +26 -0
  27. app/services/analysis/health_score.py +46 -0
  28. app/services/analysis/kpi.py +95 -0
  29. app/services/analysis/registry.py +65 -0
  30. app/services/analysis/risk.py +57 -0
  31. app/services/analysis/simulation.py +67 -0
  32. app/services/feature_service.py +306 -0
  33. app/services/ingestion/__init__.py +57 -0
  34. app/services/ingestion/doc_keywords.py +1408 -0
  35. app/services/ingestion/dolphin/__init__.py +158 -0
  36. app/services/ingestion/dolphin/classifier.py +278 -0
  37. app/services/ingestion/dolphin/client.py +393 -0
  38. app/services/ingestion/dolphin/extractor.py +336 -0
  39. app/services/ingestion/dolphin/remote_client.py +110 -0
  40. app/services/ingestion/keyword_learner.py +262 -0
  41. app/services/ingestion/learned_keywords.json +1 -0
  42. app/services/ingestion/mappings.py +389 -0
  43. app/services/ingestion/parser_csv.py +139 -0
  44. app/services/ingestion/parser_dolphin.py +471 -0
  45. app/services/ingestion/parser_pdf.py +213 -0
  46. app/services/ingestion/parser_xlsx.py +312 -0
  47. app/services/ingestion/unified_parser.py +84 -0
  48. app/services/intelligence/ai_cfo.py +52 -0
  49. app/services/intelligence/gemini_service.py +238 -0
  50. app/services/intelligence/geo_service.py +104 -0
Dockerfile CHANGED
@@ -1,23 +1,37 @@
1
- FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
2
 
3
- # System deps
4
- RUN apt-get update && apt-get install -y --no-install-recommends \
5
- python3 python3-pip poppler-utils git \
 
 
 
 
 
 
 
 
 
 
6
  && rm -rf /var/lib/apt/lists/*
7
 
8
- # Create non-root user (required by HF Spaces)
9
  RUN useradd -m -u 1000 user
10
  USER user
11
- ENV HOME=/home/user PATH="/home/user/.local/bin:$PATH"
12
-
13
  WORKDIR /home/user/app
14
 
15
- # Install Python deps first (layer caching)
16
- COPY --chown=user requirements.txt .
17
- RUN pip install --no-cache-dir --user -r requirements.txt
18
-
19
  # Copy application code
20
- COPY --chown=user . .
21
 
 
 
 
 
 
 
 
 
22
  EXPOSE 7860
23
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
1
FROM python:3.10-slim

# Set environment
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV PATH="/home/user/.local/bin:$PATH"

# HF Spaces Free Tier: Force CPU-only mode
ENV DOLPHIN_DEVICE=cpu
ENV DOLPHIN_MAX_BATCH_SIZE=1

# Install system dependencies (poppler for pdf2image).
# --no-install-recommends keeps the image slim.
RUN apt-get update && apt-get install -y --no-install-recommends \
    poppler-utils \
    git \
    && rm -rf /var/lib/apt/lists/*

# Create user (Hugging Face Spaces runs as user 1000)
RUN useradd -m -u 1000 user
USER user

WORKDIR /home/user/app

# FIX: copy requirements.txt and install dependencies BEFORE copying the
# full source tree, so the (slow) pip layer is cached across code-only
# rebuilds instead of being invalidated by every source change.
COPY --chown=user:user requirements.txt .
RUN pip3 install --no-cache-dir --upgrade pip && \
    pip3 install --no-cache-dir -r requirements.txt

# Copy application code
COPY --chown=user:user . .

# Create models directory for cached model weights
RUN mkdir -p /home/user/app/models/dolphin-v2

# Expose port (HF Spaces defaults to 7860)
EXPOSE 7860

# Start the worker directly (no ngrok, no bash wrapper)
CMD ["python3", "-m", "uvicorn", "worker:app", "--host", "0.0.0.0", "--port", "7860"]
app/__init__.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Visique Backend Application
3
+
4
+ This package contains the backend API and services for the Visique financial analysis platform.
5
+
6
+ ## Architecture Overview
7
+
8
+ ```
9
+ app/
10
+ ├── api/ # FastAPI route handlers
11
+ │ ├── admin.py # Admin console endpoints (users, reports, features)
12
+ │ ├── auth.py # Authentication (login, register, JWT)
13
+ │ └── endpoints.py # Analysis endpoints (upload, simulate, report)
14
+
15
+ ├── core/ # Core configuration and utilities
16
+ │ ├── config.py # Environment settings (API keys, URLs)
17
+ │ ├── database.py # SQLAlchemy database connection
18
+ │ ├── security.py # JWT token creation/validation
19
+ │ ├── feature_registry.py # Auto-discoverable feature definitions
20
+ │ └── plan_config.py # Plan limits and default features
21
+
22
+ ├── models/ # SQLAlchemy database models
23
+ │ ├── user.py # User, Analysis, Payment models
24
+ │ └── feature_flags.py # PlanFeatureOverride, PlanUploadLimit
25
+
26
+ ├── schemas/ # Pydantic request/response schemas
27
+ │ ├── user.py # UserCreate, UserResponse, etc.
28
+ │ ├── financial.py # StandardizedDataPackage, KPIs, etc.
29
+ │ └── chat.py # ChatRequest, ChatResponse
30
+
31
+ ├── services/ # Business logic layer
32
+ │ ├── feature_service.py # Feature flag resolution logic
33
+ │ ├── analysis/ # Financial analysis modules
34
+ │ │ ├── fundamental.py # Main analysis orchestrator
35
+ │ │ ├── kpi.py # KPI calculations
36
+ │ │ ├── risk.py # Risk analysis
37
+ │ │ ├── health_score.py # Health score computation
38
+ │ │ ├── growth.py # Growth metrics
39
+ │ │ └── simulation.py # What-if scenario modeling
40
+ │ ├── ingestion/ # Data parsing
41
+ │ │ ├── parser_csv.py # CSV file parsing
42
+ │ │ ├── parser_pdf.py # PDF extraction + OCR
43
+ │ │ └── mappings.py # Field name normalization
44
+ │ ├── intelligence/ # AI-powered features
45
+ │ │ ├── gemini_service.py # Gemini API integration
46
+ │ │ ├── ai_cfo.py # AI CFO chat functionality
47
+ │ │ ├── geo_service.py # Geo-strategic analysis
48
+ │ │ └── rag.py # RAG for document QA
49
+ │ └── reporting/ # Report generation
50
+ │ ├── pdf_report.py # PDF report builder
51
+ │ └── pptx_report.py # PowerPoint builder
52
+
53
+ └── main.py # FastAPI app initialization
54
+ ```
55
+
56
+ ## Module Responsibilities
57
+
58
+ ### API Layer (`api/`)
59
+ - HTTP request handling only
60
+ - Input validation via Pydantic
61
+ - Delegates all logic to services
62
+ - Returns standardized responses
63
+
64
+ ### Core Layer (`core/`)
65
+ - Application-wide configuration
66
+ - Feature registry (add new features here)
67
+ - Plan configuration (modify limits here)
68
+ - Security utilities (JWT)
69
+
70
+ ### Models Layer (`models/`)
71
+ - Database schema definitions
72
+ - Relationships between entities
73
+ - No business logic
74
+
75
+ ### Schemas Layer (`schemas/`)
76
+ - Request/response validation
77
+ - Data transformation for API
78
+ - Type hints for IDE support
79
+
80
+ ### Services Layer (`services/`)
81
+ - All business logic lives here
82
+ - Each subdirectory is a domain
83
+ - Services are stateless and testable
84
+
85
+ ## Adding New Features
86
+
87
+ 1. **New Feature Flag**: Add to `core/feature_registry.py`
88
+ 2. **New API Endpoint**: Add to appropriate `api/*.py`
89
+ 3. **New Service Logic**: Create in `services/` subdirectory
90
+ 4. **New Model Field**: Add to `models/` and run migration
91
+
92
+ ## Key Design Patterns
93
+
94
+ - **Repository Pattern**: Services interact with DB via session
95
+ - **Dependency Injection**: FastAPI `Depends()` for DB/auth
96
+ - **Single Responsibility**: Each module has one clear purpose
97
+ - **Feature Registry**: Auto-discoverable, category-organized
98
+ """
app/api/admin.py ADDED
@@ -0,0 +1,606 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Admin console API routes: user/payment management, feature flags, usage."""
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy.orm import Session
from typing import List, Optional
from app.core.database import get_db
from app.models.user import User, Payment, Analysis
from app.schemas.user import UserResponse, PaymentResponse
from app.api.auth import get_current_user
import os

router = APIRouter(prefix="/admin", tags=["admin"])


def get_current_admin(current_user: User = Depends(get_current_user)):
    """Dependency: resolve the authenticated user and require admin rights.

    Raises:
        HTTPException: 403 if the authenticated user is not an admin.
    """
    if current_user.is_admin:
        return current_user
    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="The user doesn't have enough privileges",
    )
19
+
20
@router.get("/payments", response_model=List[PaymentResponse])
def read_all_payments(
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """Return a paginated list of all payment records (admin only)."""
    return db.query(Payment).offset(skip).limit(limit).all()
29
+
30
@router.delete("/users/{user_id}", status_code=status.HTTP_204_NO_CONTENT)
def delete_user(
    user_id: int,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """Delete a user by id; an admin cannot delete their own account.

    Raises:
        HTTPException: 404 when the user does not exist,
            400 on self-deletion attempts.
    """
    target = db.query(User).filter(User.id == user_id).first()
    if target is None:
        raise HTTPException(status_code=404, detail="User not found")

    # Guard: deleting your own admin account would lock you out mid-session.
    if target.id == current_user.id:
        raise HTTPException(status_code=400, detail="Cannot delete your own admin account")

    db.delete(target)
    db.commit()
    return None
46
+
47
from pydantic import BaseModel


class AdminUserUpdate(BaseModel):
    """Partial-update payload for admin edits of a user profile.

    Every field is optional; only fields explicitly set by the caller are
    applied (the route uses ``dict(exclude_unset=True)``).
    """
    full_name: Optional[str] = None
    company_name: Optional[str] = None
    plan: Optional[str] = None
    is_admin: Optional[bool] = None
    is_super_admin: Optional[bool] = None  # only super admins may change this
    visique_id: Optional[str] = None
    ein: Optional[str] = None
    address: Optional[str] = None
    industry: Optional[str] = None


class FeatureToggleRequest(BaseModel):
    """Bulk feature toggle payload."""
    feature_states: dict  # {feature_id: bool}
61
+
62
@router.put("/users/{user_id}", response_model=UserResponse)
def update_user_admin(
    user_id: int,
    user_update: AdminUserUpdate,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """Update a user's profile fields (admin only).

    Admin profiles may only be edited by super admins, and only super
    admins may grant or revoke the ``is_super_admin`` flag.
    """
    user = db.query(User).filter(User.id == user_id).first()
    if user is None:
        raise HTTPException(status_code=404, detail="User not found")

    # Check if target is admin and requester is not super admin
    if user.is_admin and not current_user.is_super_admin:
        raise HTTPException(
            status_code=403,
            detail="Only Special Admins can edit Admin profiles"
        )

    for field, value in user_update.dict(exclude_unset=True).items():
        # The super-admin flag is silently skipped for regular admins.
        if field == "is_super_admin" and not current_user.is_super_admin:
            continue
        setattr(user, field, value)

    db.commit()
    db.refresh(user)
    return user
90
+
91
+
92
@router.put("/users/{user_id}/features")
def update_user_features(
    user_id: int,
    request: FeatureToggleRequest,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Update custom feature overrides for a specific user.

    Merges ``request.feature_states`` into the user's existing overrides;
    keys not present in the request are preserved.
    """
    user = db.query(User).filter(User.id == user_id).first()
    if not user:
        raise HTTPException(status_code=404, detail="User not found")

    current_features = user.custom_features or {}

    # SQLite may hand the JSON column back as a raw string; parse it.
    if isinstance(current_features, str):
        import json
        try:
            current_features = json.loads(current_features)
        # FIX: was a bare `except:`, which swallows everything (including
        # KeyboardInterrupt/SystemExit). Only decode/type failures mean
        # "treat as empty".
        except (ValueError, TypeError):
            current_features = {}

    # Copy into a fresh dict so the ORM sees a new object, then merge.
    new_features = dict(current_features)
    new_features.update(request.feature_states)
    user.custom_features = new_features

    # JSON columns don't track in-place mutation; explicitly mark dirty.
    from sqlalchemy.orm.attributes import flag_modified
    flag_modified(user, "custom_features")

    db.commit()
    return {
        "status": "success",
        "user_id": user.id,
        "custom_features": user.custom_features
    }
134
+
135
+
136
class EngineUpdateRequest(BaseModel):
    engine: str  # accepted values: "v1" or "v2"


@router.put("/users/{user_id}/engine")
def update_user_engine(
    user_id: int,
    request: EngineUpdateRequest,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Update a user's preferred engine (v1 or v2).
    """
    user = db.query(User).filter(User.id == user_id).first()
    if user is None:
        raise HTTPException(status_code=404, detail="User not found")

    if request.engine not in ("v1", "v2"):
        raise HTTPException(status_code=400, detail="Invalid engine. Must be 'v1' or 'v2'")

    user.preferred_engine = request.engine
    db.commit()
    db.refresh(user)

    return {"status": "success", "user_id": user.id, "preferred_engine": user.preferred_engine}
161
+
162
@router.get("/users", response_model=List[UserResponse])
def read_all_users(
    skip: int = 0,
    limit: int = 100,
    search: Optional[str] = None,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """List users, optionally filtered by email, name, or Visique ID."""
    query = db.query(User)
    if search:
        # Case-insensitive partial match across the three identity fields.
        pattern = f"%{search}%"
        query = query.filter(
            User.email.ilike(pattern)
            | User.full_name.ilike(pattern)
            | User.visique_id.ilike(pattern)
        )
    return query.offset(skip).limit(limit).all()
180
+
181
@router.get("/analyses")
def read_all_analyses(
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Get all analyses from all users.
    Returns a simplified list for the admin dashboard.
    """
    # Join with User to get owner details; newest analyses first.
    records = (
        db.query(Analysis)
        .join(User)
        .order_by(Analysis.timestamp.desc())
        .offset(skip)
        .limit(limit)
        .all()
    )
    return [
        {
            "id": a.id,
            "company_name": a.company_name,
            "filename": a.input_filename,
            "timestamp": a.timestamp,
            "owner_email": a.owner.email,
            "owner_visique_id": a.owner.visique_id,
        }
        for a in records
    ]
206
+
207
@router.delete("/analyses/{analysis_id}", status_code=status.HTTP_204_NO_CONTENT)
def delete_analysis_admin(
    analysis_id: int,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """Delete an analysis record and best-effort remove its stored file."""
    analysis = db.query(Analysis).filter(Analysis.id == analysis_id).first()
    if analysis is None:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # Remove the uploaded file from disk; a failed unlink must not block
    # deletion of the database row (deliberate best-effort cleanup).
    stored = analysis.stored_filename
    if stored and os.path.exists(stored):
        try:
            os.remove(stored)
        except OSError:
            pass

    db.delete(analysis)
    db.commit()
    return None
227
+
228
+
229
+ # =============================================================================
230
+ # USAGE TRACKING ENDPOINTS
231
+ # =============================================================================
232
+
233
@router.get("/usage")
def get_usage_stats(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Get upload usage statistics for all users.
    Shows uploads used, limit, and percentage for admin dashboard.
    """
    from app.services.feature_service import get_effective_upload_limit

    stats = []
    for u in db.query(User).all():
        # Admin accounts report under the synthetic "Admin" plan.
        plan = "Admin" if u.is_admin else (u.plan or "Individual")

        cap = get_effective_upload_limit(db, plan)
        used = u.monthly_upload_count or 0
        pct = round(used / cap * 100, 1) if cap > 0 else 0

        stats.append({
            "id": u.id,
            "email": u.email,
            "full_name": u.full_name,
            "visique_id": u.visique_id,
            "plan": plan,
            "uploads_used": used,
            "uploads_limit": cap,
            "usage_percentage": pct,
            "reset_date": u.upload_reset_date.isoformat() if u.upload_reset_date else None,
        })

    # Heaviest users first.
    stats.sort(key=lambda row: row["usage_percentage"], reverse=True)
    return stats
271
+
272
+
273
+ # =============================================================================
274
+ # FEATURE FLAG ENDPOINTS
275
+ # =============================================================================
276
+
277
@router.get("/features")
def get_feature_matrix(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Get the full feature matrix for admin console.
    Shows all features grouped by category with per-plan toggles.
    """
    # Imported lazily and renamed so the service helper doesn't shadow
    # this route function's own name.
    from app.services.feature_service import get_feature_matrix as get_matrix
    return get_matrix(db)
288
+
289
+
290
@router.get("/features/registry")
def get_feature_registry(
    current_user: User = Depends(get_current_admin)
):
    """
    Get the feature registry - all available features.
    Useful for understanding what features can be controlled.
    """
    from app.core.feature_registry import get_features_by_category, get_all_feature_ids

    # Flatten each category's feature objects into plain dicts for JSON.
    categories = {
        cat_name: [
            {
                "id": feat.id,
                "name": feat.name,
                "description": feat.description,
                "default_enabled": feat.default_enabled,
            }
            for feat in feats
        ]
        for cat_name, feats in get_features_by_category().items()
    }

    return {
        "total_features": len(get_all_feature_ids()),
        "categories": categories,
    }
318
+
319
+
320
@router.get("/features/{plan_name}")
def get_plan_features(
    plan_name: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Get enabled features for a specific plan.
    """
    from app.services.feature_service import get_effective_features, get_effective_upload_limit
    from app.core.plan_config import get_all_plans, get_all_engines

    # Plan names and engine names share this endpoint.
    is_known = plan_name in get_all_plans() or plan_name in get_all_engines()
    if not is_known:
        raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")

    return {
        "plan": plan_name,
        "upload_limit": get_effective_upload_limit(db, plan_name),
        "enabled_features": get_effective_features(db, plan_name),
    }
340
+
341
+
342
@router.put("/features/{plan_name}")
def update_plan_features(
    plan_name: str,
    request: FeatureToggleRequest,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Bulk update features for a plan.
    """
    from app.services.feature_service import bulk_set_features
    from app.core.plan_config import get_all_plans, get_all_engines

    if plan_name not in get_all_plans() and plan_name not in get_all_engines():
        raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")

    updated = bulk_set_features(db, plan_name, request.feature_states, current_user.id)

    return {
        "message": f"Updated {updated} features for {plan_name}",
        "plan": plan_name,
        "updated_count": updated,
    }
365
+
366
+
367
@router.post("/features/{plan_name}/reset")
def reset_plan_features(
    plan_name: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Reset a plan's features to defaults (removes all overrides).
    """
    from app.services.feature_service import reset_plan_to_defaults
    from app.core.plan_config import get_all_plans, get_all_engines

    if plan_name not in get_all_plans() and plan_name not in get_all_engines():
        raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")

    removed = reset_plan_to_defaults(db, plan_name)

    return {
        "message": f"Reset {plan_name} to defaults, removed {removed} overrides",
        "plan": plan_name,
        "removed_overrides": removed,
    }
389
+
390
+
391
class UploadLimitRequest(BaseModel):
    upload_limit: int


@router.put("/features/{plan_name}/limit")
def update_plan_upload_limit(
    plan_name: str,
    request: UploadLimitRequest,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_admin)
):
    """
    Update upload limit for a plan.
    """
    from app.models.feature_flags import PlanUploadLimit
    from app.core.plan_config import get_all_plans

    if plan_name not in get_all_plans():
        raise HTTPException(status_code=404, detail=f"Unknown plan: {plan_name}")

    # Upsert the per-plan override row, recording who changed it.
    override = (
        db.query(PlanUploadLimit)
        .filter(PlanUploadLimit.plan_name == plan_name)
        .first()
    )
    if override is None:
        override = PlanUploadLimit(
            plan_name=plan_name,
            upload_limit=request.upload_limit,
            updated_by_id=current_user.id,
        )
        db.add(override)
    else:
        override.upload_limit = request.upload_limit
        override.updated_by_id = current_user.id

    db.commit()

    return {
        "message": f"Updated upload limit for {plan_name}",
        "plan": plan_name,
        "new_limit": request.upload_limit,
    }
434
+
435
+
436
+ # =============================================================================
437
+ # CLASSIFIER TRAINING ENDPOINTS
438
+ # =============================================================================
439
+
440
from fastapi import UploadFile, File, Form

@router.get("/classifier/doc-types")
def get_classifier_doc_types(
    current_user: User = Depends(get_current_admin)
):
    """
    Get all 53 document types for the training UI dropdown.
    Returns summary list with id, key, display_name, category, keyword_count.
    """
    # Lazy import keeps the (large) keyword registry off the module import path.
    from app.services.ingestion.doc_keywords import get_all_doc_types_summary
    return get_all_doc_types_summary()
452
+
453
+
454
@router.post("/classifier/train")
async def train_classifier(
    doc_type: str = Form(...),
    files: List[UploadFile] = File(...),
    current_user: User = Depends(get_current_admin)
):
    """
    Upload up to 5 reference PDFs for a specific doc type.
    Extracts candidate keywords that admins can review and approve.
    Uses thread pool for PDF processing to prevent blocking.
    """
    from app.services.ingestion.keyword_learner import (
        extract_candidate_keywords, MAX_TRAINING_FILES
    )
    from app.services.ingestion.doc_keywords import DOC_TYPE_REGISTRY
    # FIX: removed unused `from pypdf import PdfReader` import — extraction
    # goes through HybridPDFParser below, PdfReader was never referenced.
    import tempfile
    import shutil
    import time
    from fastapi.concurrency import run_in_threadpool

    request_start = time.time()
    print(f"[Train] Started for doc_type={doc_type}, files={len(files)}")

    # Validate doc type
    if doc_type not in DOC_TYPE_REGISTRY:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown doc type: {doc_type}. Use GET /admin/classifier/doc-types for valid types."
        )

    # Validate file count
    if len(files) > MAX_TRAINING_FILES:
        raise HTTPException(
            status_code=400,
            detail=f"Maximum {MAX_TRAINING_FILES} files allowed per training batch."
        )

    # Helper function to run in threadpool
    def extract_pdf_text(path: str) -> str:
        """Extract combined markdown text from one PDF (blocking; threadpool)."""
        from app.services.ingestion.hybrid_parser import HybridPDFParser
        try:
            parser = HybridPDFParser()
            result = parser.parse(path)
            # The hybrid parser returns a combined markdown string
            return result.full_markdown if result and result.full_markdown else ""
        except Exception as e:
            print(f"Hybrid PDF Extraction Error: {e}")
            return ""

    # Process files
    texts = []
    errors = []

    for f in files:
        # FIX: UploadFile.filename may be None; guard before calling .lower()
        # so a missing filename is reported instead of raising AttributeError.
        if not f.filename or not f.filename.lower().endswith(".pdf"):
            errors.append(f"Skipped non-PDF: {f.filename}")
            continue

        tmp_path = None
        try:
            # Stream to temp file (low memory usage)
            file_start = time.time()
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                shutil.copyfileobj(f.file, tmp)
                tmp_path = tmp.name

            file_size = os.path.getsize(tmp_path) / 1024
            print(f"[Train] File '{f.filename}' saved ({file_size:.0f}KB) in {time.time() - file_start:.2f}s")

            # Run extraction in thread pool (non-blocking)
            extract_start = time.time()
            text = await run_in_threadpool(extract_pdf_text, tmp_path)
            print(f"[Train] Extraction completed in {time.time() - extract_start:.2f}s, {len(text)} chars")

            if text.strip():
                texts.append(text)
            else:
                errors.append(f"No text extracted from: {f.filename}")

        except Exception as e:
            print(f"[Train] ERROR processing {f.filename}: {e}")
            errors.append(f"Failed to process {f.filename}: {str(e)}")
        finally:
            # Clean up temp file
            if tmp_path and os.path.exists(tmp_path):
                try:
                    os.unlink(tmp_path)
                # FIX: was a bare `except:` — only filesystem errors are
                # expected here; anything else should propagate.
                except OSError:
                    pass

    if not texts:
        raise HTTPException(
            status_code=400,
            detail=f"No valid text extracted from any files. Errors: {errors}"
        )

    # Extract candidate keywords (CPU intensive, also run in threadpool)
    kw_start = time.time()
    candidates = await run_in_threadpool(extract_candidate_keywords, texts, doc_type)
    print(f"[Train] Keyword extraction completed in {time.time() - kw_start:.2f}s, {len(candidates)} candidates")
    print(f"[Train] TOTAL request time: {time.time() - request_start:.2f}s")

    return {
        "doc_type": doc_type,
        "files_processed": len(texts),
        "candidates": candidates,
        "errors": errors if errors else None,
    }
563
+
564
+
565
class KeywordApprovalRequest(BaseModel):
    doc_type: str
    keywords: List[str]


@router.post("/classifier/approve")
def approve_classifier_keywords(
    request: KeywordApprovalRequest,
    current_user: User = Depends(get_current_admin)
):
    """
    Approve candidate keywords and persist them to the learned registry.
    """
    from app.services.ingestion.keyword_learner import approve_keywords
    from app.services.ingestion.doc_keywords import DOC_TYPE_REGISTRY

    if request.doc_type not in DOC_TYPE_REGISTRY:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown doc type: {request.doc_type}"
        )
    if not request.keywords:
        raise HTTPException(
            status_code=400,
            detail="No keywords provided."
        )

    return approve_keywords(request.doc_type, request.keywords)
595
+
596
+
597
@router.get("/classifier/stats")
def get_classifier_stats(
    current_user: User = Depends(get_current_admin)
):
    """
    Get classifier training statistics for the admin dashboard.
    """
    # Thin delegate: all aggregation lives in the keyword_learner service.
    from app.services.ingestion.keyword_learner import get_training_stats
    return get_training_stats()
606
+
app/api/auth.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime, timedelta
2
+ from typing import Optional
3
+ from fastapi import APIRouter, Depends, HTTPException, status
4
+ from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
5
+ from jose import JWTError, jwt
6
+ from passlib.context import CryptContext
7
+ from sqlalchemy.orm import Session
8
+ from app.core.database import get_db
9
+ from app.models.user import User
10
+ from app.schemas.user import UserCreate, UserResponse, Token, UpgradeRequest
11
+ from app.core.security import SECRET_KEY, ALGORITHM, ACCESS_TOKEN_EXPIRE_MINUTES
12
+
13
+ from app.core.security import verify_password, get_password_hash, create_access_token, ALGORITHM, SECRET_KEY, ACCESS_TOKEN_EXPIRE_MINUTES
14
+
15
+ router = APIRouter(prefix="/auth", tags=["auth"])
16
+
@router.get("/probe")
def probe():
    """Liveness probe confirming the auth router is mounted."""
    response = {"status": "auth_router_working"}
    return response
+
21
+ from fastapi import Query
22
+
23
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/login", auto_error=False)
24
+
@router.post("/register", response_model=UserResponse)
def register(user: UserCreate, db: Session = Depends(get_db)):
    """Create a new user account.

    Grants admin / super-admin flags when a valid admin key is supplied and
    assigns a generated Visique ID: ``VISI-######`` for admins,
    ``VSQ-XXXXXXXX`` for regular users.

    Raises:
        HTTPException 400: the email is already registered.
        HTTPException 500: any other failure during account creation.
    """
    try:
        # Reject duplicate registrations by email.
        db_user = db.query(User).filter(User.email == user.email).first()
        if db_user:
            raise HTTPException(status_code=400, detail="Email already registered")
        hashed_password = get_password_hash(user.password)

        # Valid Admin Keys
        # NOTE(review): admin keys are hard-coded in source — consider moving
        # them to configuration/secrets.
        VALID_ADMIN_KEYS = [
            "VSQADM001", "VSQADM002", "VSQADM003",
            "VSQADM004", "VSQADM005", "VSQADM006"
        ]

        # Check Admin Key
        is_admin = False
        is_super_admin = False
        SUPER_ADMIN_KEYS = ["VSQADM003", "VSQADM006"]

        if user.admin_key and user.admin_key in VALID_ADMIN_KEYS:
            is_admin = True
            if user.admin_key in SUPER_ADMIN_KEYS:
                is_super_admin = True

        # Generate Visique ID
        import uuid
        import random
        if is_admin:
            # VISI-###### (6 digits)
            digits = ''.join([str(random.randint(0, 9)) for _ in range(6)])
            visique_id = f"VISI-{digits}"
        else:
            visique_id = f"VSQ-{str(uuid.uuid4())[:8].upper()}"

        new_user = User(
            email=user.email,
            hashed_password=hashed_password,
            full_name=user.full_name,
            company_name=user.company_name,
            is_admin=is_admin,
            is_super_admin=is_super_admin,
            visique_id=visique_id
        )
        db.add(new_user)
        db.commit()
        db.refresh(new_user)
        return new_user
    except HTTPException as he:
        # Re-raise intentional HTTP errors (e.g. duplicate email) untouched.
        raise he
    except Exception as e:
        print(f"Registration Error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Registration failed: {str(e)}")
+
@router.post("/login", response_model=Token)
def login(form_data: OAuth2PasswordRequestForm = Depends(), db: Session = Depends(get_db)):
    """Authenticate by email/password and issue a bearer JWT."""
    user = db.query(User).filter(User.email == form_data.username).first()
    authenticated = user is not None and verify_password(form_data.password, user.hashed_password)
    if not authenticated:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Incorrect username or password",
            headers={"WWW-Authenticate": "Bearer"},
        )
    token = create_access_token(
        data={"sub": user.email},
        expires_delta=timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES),
    )
    return {"access_token": token, "token_type": "bearer"}
+
async def get_current_user(
    token: Optional[str] = Depends(oauth2_scheme),
    db: Session = Depends(get_db),
    # Frontend passes ?token=... for downloads
    query_token: Optional[str] = Query(None, alias="token")
):
    """FastAPI dependency resolving the authenticated user from a JWT.

    Accepts the token from the Authorization header (oauth2_scheme has
    auto_error disabled) or, as a fallback, from a ``?token=`` query
    parameter used by the frontend for direct download links.

    Raises:
        HTTPException 401: missing token, invalid/expired JWT, or unknown user.
    """
    # Header token wins; the query-string token is the fallback.
    actual_token = token or query_token

    credentials_exception = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Could not validate credentials",
        headers={"WWW-Authenticate": "Bearer"},
    )
    if not actual_token:
        raise credentials_exception

    try:
        payload = jwt.decode(actual_token, SECRET_KEY, algorithms=[ALGORITHM])
        email: str = payload.get("sub")
        if email is None:
            # HTTPException is not a JWTError, so this propagates out of the try.
            raise credentials_exception
    except JWTError:
        raise credentials_exception
    user = db.query(User).filter(User.email == email).first()
    if user is None:
        raise credentials_exception
    return user
+
@router.get("/me", response_model=UserResponse)
async def read_users_me(current_user: User = Depends(get_current_user)):
    """Return the authenticated caller's own profile."""
    return current_user
+
125
+ from pydantic import BaseModel
126
+
class ProfileUpdate(BaseModel):
    """Partial-update payload for PATCH /auth/me; omitted fields are untouched."""
    full_name: Optional[str] = None
    company_name: Optional[str] = None
    address: Optional[str] = None
+
@router.patch("/me", response_model=UserResponse)
async def update_profile(
    updates: ProfileUpdate,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Partially update the caller's profile.

    A company-name change is propagated to all of the user's past analyses,
    including the cached ``result_json`` payloads, so historical reports
    show the new name.
    """
    if updates.full_name is not None:
        current_user.full_name = updates.full_name
    if updates.company_name is not None and updates.company_name != current_user.company_name:
        current_user.company_name = updates.company_name
        # Propagate to all past analyses
        from app.models.user import Analysis
        import json

        analyses = db.query(Analysis).filter(Analysis.user_id == current_user.id).all()
        for analysis in analyses:
            analysis.company_name = updates.company_name
            if analysis.result_json:
                try:
                    data = json.loads(analysis.result_json)
                    if "raw_data" in data and "company_name" in data["raw_data"]:
                        data["raw_data"]["company_name"] = updates.company_name
                    analysis.result_json = json.dumps(data)
                except Exception as e:
                    # Best-effort: a malformed cached report must not block the profile update.
                    print(f"Error updating result_json for analysis {analysis.id}: {e}")

    if updates.address is not None:
        current_user.address = updates.address
    db.commit()
    db.refresh(current_user)
    return current_user
+
164
+ from app.core.config import settings
165
+ from app.core.stripe_config import create_checkout_session
166
+ import stripe
167
+ from fastapi import Request
168
+
@router.post("/create-checkout-session")
def create_payment(
    plan_id: str,  # Pass the Stripe Price ID
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Create a Stripe Checkout session for the given price id and return its URL."""
    checkout = create_checkout_session(current_user, plan_id)
    if not checkout:
        raise HTTPException(status_code=400, detail="Error creating payment session")
    return {"url": checkout.url}
+
@router.post("/webhook")
async def stripe_webhook(request: Request, db: Session = Depends(get_db)):
    """Stripe webhook receiver.

    Verifies the event signature, then on ``checkout.session.completed``
    upgrades the referenced user's plan for 30 days and records the payment.

    Raises:
        HTTPException 400: payload or signature verification failed.
    """
    payload = await request.body()
    sig_header = request.headers.get("stripe-signature")

    try:
        event = stripe.Webhook.construct_event(
            payload, sig_header, settings.STRIPE_WEBHOOK_SECRET
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail="Invalid payload")
    except stripe.error.SignatureVerificationError as e:
        raise HTTPException(status_code=400, detail="Invalid signature")

    if event["type"] == "checkout.session.completed":
        session = event["data"]["object"]

        # Retrieve user and update plan
        # Note: metadata values are strings
        user_id = session.get("client_reference_id")
        if user_id:
            user = db.query(User).filter(User.id == int(user_id)).first()
            if user:
                # NOTE(review): plan is hard-coded to "Business" regardless of
                # which price was purchased — confirm this is intentional.
                user.plan = "Business"  # Or derive from session
                user.plan_expires_at = datetime.utcnow() + timedelta(days=30)

                # Record Payment
                from app.models.user import Payment
                new_payment = Payment(
                    user_id=user.id,
                    # Stripe amounts are in the smallest currency unit (cents).
                    amount=session.get("amount_total", 0) / 100.0,
                    status="paid",
                    plan_name="Business",
                    date=datetime.utcnow()
                )
                db.add(new_payment)
                db.commit()

    return {"status": "success"}
+
220
+ from typing import List
221
+ from app.schemas.user import PaymentResponse
222
+ from app.models.user import Payment
223
+
@router.get("/payments/me", response_model=List[PaymentResponse])
def read_my_payments(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """List every payment record belonging to the authenticated user."""
    payments = db.query(Payment).filter(Payment.user_id == current_user.id)
    return payments.all()
+
228
+ from fastapi import UploadFile, File
229
+ from fastapi.responses import RedirectResponse
230
+ from app.services.storage import StorageService
231
+ import uuid
232
+
@router.post("/me/avatar")
async def upload_avatar(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload a profile picture to R2 and store a proxy URL on the user.

    Raises:
        HTTPException 400: extension is not one of jpg/jpeg/png/webp.
        HTTPException 500: the storage upload failed.
    """
    # Determine file extension
    ext = file.filename.split(".")[-1]
    if ext.lower() not in ["jpg", "jpeg", "png", "webp"]:
        raise HTTPException(status_code=400, detail="Invalid image format. Use JPG, PNG, or WebP.")

    # Save file to R2
    # Prefix with the user id + a UUID so uploads can never collide.
    safe_filename = f"{current_user.id}_{uuid.uuid4()}.{ext}"
    object_key = f"avatars/{safe_filename}"

    try:
        await StorageService.upload_file(file, object_key)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Avatar upload failed: {str(e)}")

    # Update User Profile
    # Point to the proxy endpoint
    current_user.profile_picture_url = f"/api/v1/auth/avatars/{safe_filename}"

    db.commit()
    db.refresh(current_user)

    return {"message": "Avatar updated", "url": current_user.profile_picture_url}
+
@router.get("/avatars/{filename}")
async def get_avatar(filename: str):
    """
    Proxy endpoint for avatars.
    Redirects to a short-lived presigned URL on R2.

    Raises:
        HTTPException 404: no presigned URL could be produced for the key.
    """
    # Fix: the route path and object key contained a literal "(unknown)"
    # placeholder, so the `filename` parameter was never used and every
    # request resolved to the same nonexistent key. Use the real path
    # parameter, matching the URL written by upload_avatar.
    object_key = f"avatars/{filename}"
    try:
        url = StorageService.get_presigned_url(object_key, expiration=3600)  # 1 hour cache
        return RedirectResponse(url=url)
    except Exception:
        raise HTTPException(status_code=404, detail="Avatar not found")
app/api/endpoints.py ADDED
@@ -0,0 +1,830 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
2
+ from fastapi.concurrency import run_in_threadpool
3
+ from fastapi.responses import RedirectResponse
4
+ from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
5
+ from app.schemas.financial import StandardizedDataPackage
6
+ from app.services.storage import StorageService
7
+ from app.core.security import create_access_token
8
+ from typing import Annotated
9
+ from pydantic import BaseModel
10
+ from datetime import date
11
+ import os
12
+ from app.services.ingestion.parser_csv import CSVParser
13
+ from app.services.ingestion.parser_pdf import PDFParser
14
+ from app.services.analysis.kpi import KPIAnalyzer
15
+ from app.services.analysis.risk import RiskAnalyzer
16
+ from app.services.analysis.health_score import HealthScoreAnalyzer
17
+ from app.services.analysis.fundamental import FundamentalAnalyzer
18
+ from app.services.analysis.factory import AnalysisFactory
19
+ from app.services.analysis.growth import GrowthAnalyzer
20
+ from app.services.analysis.simulation import SimulationService
21
+ from app.services.reporting.pdf_report import PDFReporter
22
+ from app.services.reporting.pptx_report import PPTXReporter
23
+ from app.schemas.financial import StandardizedDataPackage, FinancialReport, IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics, RiskAnalysis, HealthScoreBreakdown
24
+ from financial_model.models import VisiVeritasReport
25
+ from app.schemas.chat import ChatRequest, ChatResponse
26
+ from app.api.auth import get_current_user
27
+ from app.models.user import User, Analysis
28
+ from app.core.database import get_db
29
+ from sqlalchemy.orm import Session
30
+ import json
31
+ from fastapi.responses import FileResponse
32
+ from app.services.feature_service import get_effective_features
33
+
34
+ router = APIRouter(prefix="/analysis", tags=["analysis"])
35
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")
36
+
@router.post("/token")
async def login(form_data: Annotated[OAuth2PasswordRequestForm, Depends()]):
    """Legacy demo token endpoint with a single hard-coded credential pair."""
    # Mock User DB (kept for legacy demo, but real auth is at /auth/login)
    valid = form_data.username == "analyst" and form_data.password == "visique"
    if valid:
        token = create_access_token(data={"sub": form_data.username})
        return {"access_token": token, "token_type": "bearer"}
    raise HTTPException(status_code=400, detail="Incorrect username or password")
+
@router.get("/doc-types")
def get_doc_types(current_user: User = Depends(get_current_user)):
    """Get all supported document types for the upload dropdown."""
    from app.services.ingestion.doc_keywords import get_all_doc_types_summary
    summary = get_all_doc_types_summary()
    return summary
+
# Admin Dependency
def get_current_admin(current_user: User = Depends(get_current_user)):
    """FastAPI dependency: require the authenticated user to be an admin.

    Raises:
        HTTPException 403: the user is not flagged ``is_admin``.
    """
    # Fix: removed a duplicated, unreachable copy of this check that
    # followed the return statement in the original.
    if not current_user.is_admin:
        raise HTTPException(status_code=403, detail="Admin privileges required")
    return current_user
+
@router.get("/admin/users")
def get_all_users(
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Admin: list every user with basic account stats."""
    def _serialize(u):
        # One row of the admin user table.
        return {
            "id": u.id,
            "email": u.email,
            "full_name": u.full_name,
            "company_name": u.company_name,
            "is_admin": u.is_admin,
            "created_at": u.created_at,
            "analysis_count": len(u.analyses),
            "preferred_engine": getattr(u, "preferred_engine", "v1"),
        }

    return [_serialize(u) for u in db.query(User).all()]
+
@router.get("/admin/analyses")
def get_all_analyses(
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Admin: list every analysis across all users, newest first."""
    records = db.query(Analysis).order_by(Analysis.timestamp.desc()).all()
    payload = []
    for a in records:
        payload.append({
            "id": a.id,
            "user_email": a.owner.email,
            "user_company": a.owner.company_name,
            "company_name": a.company_name,
            "filename": a.input_filename,
            "timestamp": a.timestamp,
        })
    return payload
+
@router.get("/admin/analyses/{analysis_id}/download")
def admin_download_file(
    analysis_id: int,
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Admin-only download of the original uploaded file for any analysis.

    Raises:
        HTTPException 404: record missing, no stored filename, or file absent.
    """
    analysis = db.query(Analysis).filter(Analysis.id == analysis_id).first()
    if not analysis or not analysis.stored_filename:
        raise HTTPException(status_code=404, detail="File not found")

    # NOTE(review): this serves from the local filesystem, but the upload
    # endpoints store files in R2 under keys like "uploads/..." — for those
    # records os.path.exists() is False and this returns 404. Confirm this
    # path is only meant for legacy locally-stored files.
    if not os.path.exists(analysis.stored_filename):
        raise HTTPException(status_code=404, detail="File missing from server storage")

    return FileResponse(
        path=analysis.stored_filename,
        filename=f"ADMIN_EXPORT_{analysis.input_filename}",
        media_type='application/octet-stream'
    )
+
116
+
import json  # NOTE(review): duplicate of the module-level `import json` above; harmless but removable.

# Admin Dependency
# NOTE(review): re-defines get_current_admin declared earlier in this module
# (the later definition wins at import time). Consider keeping only one copy.
def get_current_admin(current_user: User = Depends(get_current_user)):
    """FastAPI dependency: allow only admin users, else respond 403."""
    if not current_user.is_admin:
        raise HTTPException(status_code=403, detail="Admin privileges required")
    return current_user
+
@router.post("/upload/csv", response_model=StandardizedDataPackage)
async def analyze_csv(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload a CSV, persist it to R2, run the analysis pipeline, and save the result.

    Raises:
        HTTPException 403: monthly upload limit reached.
        HTTPException 400: wrong file type, or Visi-Veritas confidence < 30.
        HTTPException 500: any other parsing/analysis failure.
    """
    # Check upload limit
    from app.services.feature_service import check_upload_limit, increment_upload_count
    limit_check = check_upload_limit(db, current_user)
    if not limit_check["can_upload"]:
        raise HTTPException(
            status_code=403,
            detail=f"Monthly upload limit reached ({limit_check['uploads_limit']} uploads). Upgrade your plan for more uploads. Resets on {limit_check['reset_date'][:10]}."
        )

    if not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a .csv file.")

    # Secure filename and path
    import uuid
    safe_filename = f"{uuid.uuid4()}_{file.filename}"
    object_key = f"uploads/{safe_filename}"
    temp_path = f"/tmp/{safe_filename}"

    try:
        # Upload to R2 (Service enforces size limit)
        await StorageService.upload_file(file, object_key)

        # Parsers expect a file path, so mirror the upload into /tmp:
        # the upload consumed the stream, hence the seek(0) before copying.
        with open(temp_path, "wb") as f:
            file.file.seek(0)
            import shutil
            shutil.copyfileobj(file.file, f)

        report = await run_in_threadpool(CSVParser.parse, temp_path)

        # Clean up temp file
        if os.path.exists(temp_path):
            os.remove(temp_path)

        # Run Unified Analysis (includes Phase 2 & 3 extensions)
        # Select Engine based on User Preference
        analyzer = AnalysisFactory.get_analyzer(current_user)
        # Fetch enabled features for user's plan
        enabled_features = get_effective_features(db, current_user.plan or "Free")
        analysis_result = await run_in_threadpool(analyzer.analyze, report, user_address=current_user.address, enabled_features=enabled_features)

        # Combine text insights
        # Include risk_factors (which contain "Pain Point:" entries) in the insights array
        risk_factors = analysis_result["risk_analysis"].risk_factors if analysis_result.get("risk_analysis") else []
        all_insights = analysis_result["insights"] + analysis_result["recommendations"] + risk_factors

        # ---- Visi-Veritas Hard-Halt Check ----
        veritas_data = analysis_result.get("visi_veritas", {})
        confidence = veritas_data.get("confidence_score", 100)
        if confidence < 30:
            raise HTTPException(
                status_code=400,
                detail={
                    "error": "Visi-Veritas validation failed: The extracted financial data contains critical inaccuracies.",
                    "confidence_score": confidence,
                    "failed_rules": veritas_data.get("failed_rules", []),
                    "warnings": veritas_data.get("warnings", []),
                    "debug_context": veritas_data.get("debug_context", {}),
                }
            )

        result_package = StandardizedDataPackage(
            raw_data=report,
            kpis=analysis_result["kpis"],
            risk_analysis=analysis_result["risk_analysis"],
            health_score=analysis_result["health_score"],
            insights=all_insights,
            runway_forecast=analysis_result["runway_forecast"],
            optimization_insights=analysis_result["optimization_insights"],
            geo_analysis=analysis_result.get("geo_analysis"),
            visi_veritas=VisiVeritasReport(**veritas_data) if veritas_data else None,
        )

        # Save to DB
        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=report.company_name,
            input_filename=file.filename,
            stored_filename=object_key,
            result_json=result_package.json()
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        result_package.analysis_id = db_analysis.id
        result_package.timestamp = db_analysis.timestamp.isoformat()
        result_package.runner_name = current_user.full_name or current_user.email

        # Increment upload count AFTER successful save
        increment_upload_count(db, current_user)

        return result_package

    except HTTPException:
        # Fix: HTTPException subclasses Exception, so the generic handler
        # below used to re-wrap the Visi-Veritas 400 halt as an opaque 500.
        # Propagate intentional HTTP errors untouched.
        raise
    except Exception as e:
        # Cleanup if analysis fails: remove the temp copy; the R2 object is
        # left as an orphan (no DB record references it).
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+
252
+ from fastapi import Body
253
+
@router.post("/save")
async def save_analysis_result(
    payload: dict = Body(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """
    Receives pre-computed analysis results from Vercel serverless functions
    and persists them to the database. This endpoint does NOT run analysis -
    it only handles authentication and database storage.
    """
    from app.services.feature_service import increment_upload_count
    try:
        raw_data = payload.get("raw_data", {})
        if isinstance(raw_data, dict):
            company_name = raw_data.get("company_name", "Unknown")
        else:
            company_name = "Unknown"

        # Pop so the filename is not duplicated inside the stored JSON.
        original_filename = payload.pop("original_filename", "uploaded_file")

        record = Analysis(
            user_id=current_user.id,
            company_name=company_name,
            input_filename=original_filename,
            stored_filename="vercel_processed",
            result_json=json.dumps(payload)
        )
        db.add(record)
        db.commit()
        db.refresh(record)

        # Increment upload count
        increment_upload_count(db, current_user)

        return {"status": "saved", "analysis_id": record.id}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to save analysis: {str(e)}")
+
292
+ @router.get("/history")
@router.get("/history")
def get_history(
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """List the current user's analyses, newest first."""
    query = (
        db.query(Analysis)
        .filter(Analysis.user_id == current_user.id)
        .order_by(Analysis.timestamp.desc())
    )
    history = []
    for a in query.all():
        history.append({
            "id": a.id,
            "company_name": a.company_name,
            "filename": a.input_filename,
            "timestamp": a.timestamp,
            "runner_name": a.owner.full_name or a.owner.email,
        })
    return history
+
@router.get("/history/{analysis_id}", response_model=StandardizedDataPackage)
def get_analysis_detail(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Return one stored analysis (owned by the caller) as a full package."""
    analysis = (
        db.query(Analysis)
        .filter(Analysis.id == analysis_id, Analysis.user_id == current_user.id)
        .first()
    )
    if analysis is None:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # Rehydrate the cached JSON, then overlay record-level metadata.
    pkg = StandardizedDataPackage.parse_raw(analysis.result_json)
    pkg.analysis_id = analysis.id
    pkg.timestamp = analysis.timestamp.isoformat()
    pkg.runner_name = analysis.owner.full_name or analysis.owner.email
    return pkg
+
@router.get("/history/{analysis_id}/download")
def download_original_file(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Redirect the caller to a presigned R2 URL for their originally uploaded file."""
    analysis = (
        db.query(Analysis)
        .filter(Analysis.id == analysis_id, Analysis.user_id == current_user.id)
        .first()
    )
    if analysis is None or not analysis.stored_filename:
        raise HTTPException(status_code=404, detail="File not found")

    # Generate Presigned URL for Redirect
    try:
        url = StorageService.get_presigned_url(analysis.stored_filename)
        return RedirectResponse(url=url)
    except Exception as e:
        raise HTTPException(status_code=404, detail="File missing from storage")
+
342
+
@router.delete("/history/{analysis_id}")
def delete_analysis(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete one of the caller's analyses along with its stored source file."""
    analysis = (
        db.query(Analysis)
        .filter(Analysis.id == analysis_id, Analysis.user_id == current_user.id)
        .first()
    )
    if analysis is None:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # Delete file from R2
    if analysis.stored_filename:
        StorageService.delete_file(analysis.stored_filename)

    db.delete(analysis)
    db.commit()
    return {"status": "success", "message": "Analysis deleted"}
+
class UpdateAnalysisRequest(BaseModel):
    """Payload for renaming the company on a stored analysis."""
    company_name: str
+
@router.patch("/history/{analysis_id}")
def update_analysis(
    analysis_id: int,
    request: UpdateAnalysisRequest,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Rename the company on one of the caller's analyses.

    Updates both the DB column and the cached result JSON so the stored
    report stays consistent with the new name.

    Raises:
        HTTPException 404: analysis missing or owned by another user.
    """
    analysis = db.query(Analysis).filter(Analysis.id == analysis_id, Analysis.user_id == current_user.id).first()
    if not analysis:
        raise HTTPException(status_code=404, detail="Analysis not found")

    analysis.company_name = request.company_name

    # Update the stored JSON to reflect new name (consistency)
    # Fix: the original used a bare `except:`, which also swallows
    # SystemExit/KeyboardInterrupt; catch only what JSON access can raise.
    try:
        data = json.loads(analysis.result_json)
        data['raw_data']['company_name'] = request.company_name
        analysis.result_json = json.dumps(data)
    except (ValueError, KeyError, TypeError):
        pass  # If JSON parsing fails, just update DB record

    db.commit()
    return {"status": "success", "message": "Analysis updated", "company_name": analysis.company_name}
+
@router.post("/upload/pdf", response_model=StandardizedDataPackage)
async def analyze_pdf(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload a PDF, persist it to R2, run the analysis pipeline, and save the result.

    Raises:
        HTTPException 403: monthly upload limit reached.
        HTTPException 400: wrong file type, or Visi-Veritas confidence < 30.
        HTTPException 500: any other parsing/analysis failure.
    """
    # Check upload limit
    from app.services.feature_service import check_upload_limit, increment_upload_count
    limit_check = check_upload_limit(db, current_user)
    if not limit_check["can_upload"]:
        raise HTTPException(
            status_code=403,
            detail=f"Monthly upload limit reached ({limit_check['uploads_limit']} uploads). Upgrade your plan for more uploads. Resets on {limit_check['reset_date'][:10]}."
        )

    if not file.filename.endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a .pdf file.")

    import uuid
    safe_filename = f"{uuid.uuid4()}_{file.filename}"
    object_key = f"uploads/{safe_filename}"
    temp_path = f"/tmp/{safe_filename}"

    try:
        # Upload to R2 (Limit Enforced)
        await StorageService.upload_file(file, object_key)

        # Parse logic: the upload consumed the stream, so rewind before
        # mirroring the file into /tmp for the path-based parser.
        file.file.seek(0)
        with open(temp_path, "wb") as f:
            import shutil
            shutil.copyfileobj(file.file, f)

        report = await run_in_threadpool(PDFParser.parse, temp_path)

        # Cleanup temp
        if os.path.exists(temp_path):
            os.remove(temp_path)

        # Run Unified Analysis
        # Select Engine based on User Preference
        analyzer = AnalysisFactory.get_analyzer(current_user)

        # Resolve all feature flags (Plan + Custom + Engine limits)
        from app.services.feature_service import resolve_user_features
        enabled_features = resolve_user_features(db, current_user)

        analysis_result = await run_in_threadpool(analyzer.analyze, report, user_address=current_user.address, enabled_features=enabled_features)

        # Include risk_factors (which contain "Pain Point:" entries) in the insights array
        risk_factors = analysis_result["risk_analysis"].risk_factors if analysis_result.get("risk_analysis") else []
        all_insights = analysis_result["insights"] + analysis_result["recommendations"] + risk_factors

        # ---- Visi-Veritas Hard-Halt Check ----
        veritas_data = analysis_result.get("visi_veritas", {})
        confidence = veritas_data.get("confidence_score", 100)
        if confidence < 30:
            raise HTTPException(
                status_code=400,
                detail={
                    "error": "Visi-Veritas validation failed: The extracted financial data contains critical inaccuracies.",
                    "confidence_score": confidence,
                    "failed_rules": veritas_data.get("failed_rules", []),
                    "warnings": veritas_data.get("warnings", []),
                    "debug_context": veritas_data.get("debug_context", {}),
                }
            )

        result_package = StandardizedDataPackage(
            raw_data=report,
            kpis=analysis_result["kpis"],
            risk_analysis=analysis_result["risk_analysis"],
            health_score=analysis_result["health_score"],
            insights=all_insights,
            runway_forecast=analysis_result["runway_forecast"],
            optimization_insights=analysis_result["optimization_insights"],
            geo_analysis=analysis_result.get("geo_analysis"),
            visi_veritas=VisiVeritasReport(**veritas_data) if veritas_data else None,
        )

        # Save to DB
        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=report.company_name,
            input_filename=file.filename,
            stored_filename=object_key,
            result_json=result_package.json()
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        result_package.analysis_id = db_analysis.id
        result_package.timestamp = db_analysis.timestamp.isoformat()
        result_package.runner_name = current_user.full_name or current_user.email

        # Increment upload count AFTER successful save
        increment_upload_count(db, current_user)

        return result_package

    except HTTPException:
        # Fix: HTTPException subclasses Exception, so the generic handler
        # below used to re-wrap the Visi-Veritas 400 halt as an opaque 500.
        # Propagate intentional HTTP errors untouched.
        raise
    except Exception as e:
        # Remove the temp copy on failure; the R2 object is left orphaned.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+
492
+
493
+ # =============================================================================
494
+ # XLSX UPLOAD ENDPOINT
495
+ # =============================================================================
496
+
497
@router.post("/upload/xlsx", response_model=StandardizedDataPackage)
async def analyze_xlsx(
    file: UploadFile = File(...),
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Upload and analyze an Excel (.xlsx, .xls) file.

    Flow: quota check -> upload to R2 -> spool to /tmp -> parse ->
    analysis engine -> Visi-Veritas hard-halt check -> persist -> return.

    Raises:
        HTTPException 403: monthly upload quota exhausted.
        HTTPException 400: wrong file type or Visi-Veritas validation failure.
        HTTPException 500: any unexpected failure during processing.
    """
    # Check upload limit
    from app.services.feature_service import check_upload_limit, increment_upload_count
    limit_check = check_upload_limit(db, current_user)
    if not limit_check["can_upload"]:
        raise HTTPException(
            status_code=403,
            detail=f"Monthly upload limit reached ({limit_check['uploads_limit']} uploads). Upgrade your plan for more uploads. Resets on {limit_check['reset_date'][:10]}."
        )

    # BUGFIX: guard against a missing filename before calling .endswith()
    if not file.filename or not (file.filename.endswith('.xlsx') or file.filename.endswith('.xls')):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload an .xlsx or .xls file.")

    import uuid
    safe_filename = f"{uuid.uuid4()}_{file.filename}"
    object_key = f"uploads/{safe_filename}"
    temp_path = f"/tmp/{safe_filename}"

    try:
        # Upload original to R2 object storage (retained for re-download)
        await StorageService.upload_file(file, object_key)

        # Spool to a local temp file for the parser
        file.file.seek(0)
        with open(temp_path, "wb") as f:
            import shutil
            shutil.copyfileobj(file.file, f)

        # Use XLSX Parser; BUGFIX: clean the temp copy up even when parsing
        # raises (previously the file leaked on failure).
        from app.services.ingestion.parser_xlsx import XLSXParser
        try:
            report = await run_in_threadpool(XLSXParser.parse, temp_path)
        finally:
            if os.path.exists(temp_path):
                os.remove(temp_path)

        # Run Unified Analysis — engine selected by user preference
        analyzer = AnalysisFactory.get_analyzer(current_user)

        # Resolve all feature flags (Plan + Custom + Engine limits)
        from app.services.feature_service import resolve_user_features
        enabled_features = resolve_user_features(db, current_user)

        analysis_result = await run_in_threadpool(analyzer.analyze, report, user_address=current_user.address, enabled_features=enabled_features)

        # Risk factors (which contain "Pain Point:" entries) are folded into insights
        risk_factors = analysis_result["risk_analysis"].risk_factors if analysis_result.get("risk_analysis") else []
        all_insights = analysis_result["insights"] + analysis_result["recommendations"] + risk_factors

        # ---- Visi-Veritas Hard-Halt Check ----
        veritas_data = analysis_result.get("visi_veritas", {})
        confidence = veritas_data.get("confidence_score", 100)
        if confidence < 30:
            raise HTTPException(
                status_code=400,
                detail={
                    "error": "Visi-Veritas validation failed: The extracted financial data contains critical inaccuracies.",
                    "confidence_score": confidence,
                    "failed_rules": veritas_data.get("failed_rules", []),
                    "warnings": veritas_data.get("warnings", []),
                    "debug_context": veritas_data.get("debug_context", {}),
                }
            )

        result_package = StandardizedDataPackage(
            raw_data=report,
            kpis=analysis_result["kpis"],
            risk_analysis=analysis_result["risk_analysis"],
            health_score=analysis_result["health_score"],
            insights=all_insights,
            runway_forecast=analysis_result["runway_forecast"],
            optimization_insights=analysis_result["optimization_insights"],
            geo_analysis=analysis_result.get("geo_analysis"),
            visi_veritas=VisiVeritasReport(**veritas_data) if veritas_data else None,
        )

        # Save to DB
        db_analysis = Analysis(
            user_id=current_user.id,
            company_name=report.company_name,
            input_filename=file.filename,
            stored_filename=object_key,
            result_json=result_package.json()
        )
        db.add(db_analysis)
        db.commit()
        db.refresh(db_analysis)

        result_package.analysis_id = db_analysis.id
        result_package.timestamp = db_analysis.timestamp.isoformat()
        result_package.runner_name = current_user.full_name or current_user.email

        # Increment upload count only AFTER a fully successful save
        increment_upload_count(db, current_user)

        return result_package

    except HTTPException:
        # BUGFIX: previously `except Exception` swallowed the deliberate 400
        # Visi-Veritas halt above and re-wrapped it as a generic 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"XLSX Analysis failed: {str(e)}")
601
+
602
+
603
+ # =============================================================================
604
+ # BULK DELETE ENDPOINTS
605
+ # =============================================================================
606
+
607
class BulkDeleteRequest(BaseModel):
    """Request body for bulk deletion of analyses owned by the caller."""
    ids: list[int]  # Analysis primary keys; unknown/foreign IDs are reported, not deleted
609
+
610
@router.delete("/history/bulk-delete")
def bulk_delete_analyses(
    request: BulkDeleteRequest,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete multiple analyses at once.

    Each ID is checked against the current user's ownership; missing or
    foreign rows are collected as errors instead of aborting the batch.
    """
    errors = []
    deleted_count = 0

    for target_id in request.ids:
        record = (
            db.query(Analysis)
            .filter(Analysis.id == target_id, Analysis.user_id == current_user.id)
            .first()
        )

        if record is None:
            errors.append(f"Analysis {target_id} not found")
            continue

        # Remove the stored source file from object storage first
        if record.stored_filename:
            StorageService.delete_file(record.stored_filename)

        db.delete(record)
        deleted_count += 1

    db.commit()

    return {
        "status": "success",
        "deleted_count": deleted_count,
        "errors": errors if errors else None,
    }
644
+
645
+
646
class DateRangeDeleteRequest(BaseModel):
    """Request body for deleting analyses inside an inclusive date window."""
    start_date: str  # YYYY-MM-DD (inclusive)
    end_date: str  # YYYY-MM-DD (inclusive)
649
+
650
@router.delete("/history/delete-range")
def delete_analyses_in_range(
    request: DateRangeDeleteRequest,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete all of the current user's analyses within an inclusive date range.

    Raises:
        HTTPException 400: malformed dates or start_date after end_date.
    """
    from datetime import datetime

    try:
        start = datetime.strptime(request.start_date, "%Y-%m-%d")
        # BUGFIX: include microseconds so rows timestamped inside the final
        # second of the end day (e.g. 23:59:59.500) are not silently skipped.
        end = datetime.strptime(request.end_date, "%Y-%m-%d").replace(
            hour=23, minute=59, second=59, microsecond=999999
        )
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid date format. Use YYYY-MM-DD.")

    if start > end:
        raise HTTPException(status_code=400, detail="start_date must not be after end_date.")

    # Find analyses in range (scoped to the current user)
    analyses = db.query(Analysis).filter(
        Analysis.user_id == current_user.id,
        Analysis.timestamp >= start,
        Analysis.timestamp <= end
    ).all()

    deleted_count = 0
    for analysis in analyses:
        # Remove the stored source file before deleting the DB row
        if analysis.stored_filename:
            StorageService.delete_file(analysis.stored_filename)
        db.delete(analysis)
        deleted_count += 1

    db.commit()

    return {
        "status": "success",
        "deleted_count": deleted_count,
        "date_range": f"{request.start_date} to {request.end_date}"
    }
686
+
687
class SimulationRequest(BaseModel):
    """What-if simulation input: a prior analysis plus percentage deltas."""
    data: StandardizedDataPackage  # previously computed package to re-simulate
    delta_revenue: float = 0.0  # percent adjustments, 0.0 = unchanged
    delta_cogs: float = 0.0
    delta_payroll: float = 0.0
    delta_marketing: float = 0.0
    delta_fixed_costs: float = 0.0
694
+
695
@router.post("/simulate", response_model=StandardizedDataPackage)
async def run_simulation(request: SimulationRequest, user: User = Depends(get_current_user)):
    """Re-run the what-if simulation over a previously analyzed package.

    The auth dependency is required for access control only; the user
    object itself is not read here.
    """
    return SimulationService.run_simulation(
        original_data=request.data.raw_data,
        delta_revenue_percent=request.delta_revenue,
        delta_cogs_percent=request.delta_cogs,
        delta_payroll_percent=request.delta_payroll,
        delta_marketing_percent=request.delta_marketing,
        delta_fixed_costs_percent=request.delta_fixed_costs
    )
705
+
706
@router.get("/history/{analysis_id}/export/pdf")
def export_analysis_pdf(
    analysis_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Render a stored analysis as a downloadable PDF report."""
    from fastapi.responses import FileResponse

    analysis = db.query(Analysis).filter(Analysis.id == analysis_id, Analysis.user_id == current_user.id).first()
    if analysis is None:
        raise HTTPException(status_code=404, detail="Analysis not found")

    # Rehydrate the stored JSON payload into the typed package
    try:
        data = StandardizedDataPackage.parse_raw(analysis.result_json)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Data corruption: {str(e)}")

    # Build the PDF under /tmp using a filesystem-safe company name
    company = data.raw_data.company_name
    safe_name = "".join(ch for ch in company if ch.isalnum() or ch in " _-")
    filename = f"/tmp/{safe_name}_{analysis.id}_report.pdf"
    PDFReporter.generate(data, filename)

    from datetime import datetime
    date_str = datetime.now().strftime("%Y-%m-%d")
    download_name = f"Visi-Insight Report - {company} - {date_str}.pdf"
    return FileResponse(filename, media_type='application/pdf', filename=download_name)
734
+
735
@router.post("/ai-cfo", response_model=str)
async def get_ai_summary(data: StandardizedDataPackage, user: User = Depends(get_current_user)):
    """Generate a narrative executive summary for an analyzed data package."""
    from app.services.intelligence.ai_cfo import AICFOService
    return AICFOService.generate_executive_summary(data)
739
+
740
@router.post("/chat", response_model=ChatResponse)
async def chat_with_data(request: ChatRequest, user: User = Depends(get_current_user)):
    """Answer a chat question against a (currently mocked) financial context.

    NOTE(review): the data context is a hard-coded dummy package, not the
    caller's real data — this endpoint is scaffold-only until a session or
    vector store is wired in.
    """
    # Note: In a real app, 'data_context' would be retrieved from a session or vector DB
    # For this stateless scaffold, we assume we want to query a mock global context or previously uploaded file.
    # To keep it simple for the frontend demo, we will accept the data in the request or just mock the context access
    # since we don't have a persistent session store implemented yet.

    # Check if a file was recently uploaded (using a global for demo simplicity, or pass mock)
    # Ideally, we'd pass the DataPackage in the request, but it's too big.
    # We will instantiate a dummy context if none exists, or rely on client sending relevant context.

    # PROPER IMPLEMENTATION:
    # 1. User uploads file -> Backend stores Vector Index ID in User Session.
    # 2. /chat -> retrieves Index ID -> Queries Vector DB.

    # MOCK IMPLEMENTATION:
    from app.schemas.financial import StandardizedDataPackage, FinancialReport, IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics, RiskAnalysis, HealthScoreBreakdown
    from datetime import date

    # Create a dummy context for the scaffold to prove the endpoint works
    # In production, this would be retrieved from session or vector DB
    dummy_data = StandardizedDataPackage(
        raw_data=FinancialReport(
            company_name="Demo Corp",
            period_end=date.today(),
            income_statement=IncomeStatementStandard(revenue=1200000, net_income=240000, cogs=600000),
            balance_sheet=BalanceSheetStandard(),
            cash_flow=CashFlowStandard()
        ),
        kpis=KPIMetrics(net_margin=20.0),
        risk_analysis=RiskAnalysis(risk_score=85, risk_factors=[], liquidity_risk="Low", solvency_risk="Low"),
        health_score=HealthScoreBreakdown(stability=20, profitability=20, growth=20, efficiency=20, total_score=80),
        insights=["Automated Report Generation Successful"],
        optimization_insights=None # Should be populated normally
    )

    from app.services.intelligence.gemini_service import GeminiService
    return GeminiService.query(request, dummy_data)
778
+
779
@router.get("/export/pptx/{company_name}")
async def export_pptx(company_name: str):
    """Generate a demo PowerPoint deck for the given company name.

    NOTE(review): unlike the sibling endpoints, this route has no auth
    dependency — confirm whether it is intentionally public.
    """
    from fastapi.responses import FileResponse

    dummy_data = StandardizedDataPackage(
        raw_data=FinancialReport(
            company_name=company_name,
            period_end=date.today(),
            income_statement=IncomeStatementStandard(revenue=1000000, net_income=200000, cogs=500000),
            balance_sheet=BalanceSheetStandard(),
            cash_flow=CashFlowStandard()
        ),
        kpis=KPIMetrics(net_margin=20.0),
        risk_analysis=RiskAnalysis(risk_score=85, risk_factors=[], liquidity_risk="Low", solvency_risk="Low"),
        health_score=HealthScoreBreakdown(stability=20, profitability=20, growth=20, efficiency=20, total_score=80),
        insights=["Automated Report Generation Successful"]
    )

    # BUGFIX: sanitize the path component so a crafted company_name
    # (e.g. "../../etc/cron.d/x") cannot escape /tmp. Mirrors the
    # sanitization used by the PDF export endpoint.
    safe_name = "".join(x for x in company_name if x.isalnum() or x in " _-") or "report"
    filename = f"/tmp/{safe_name}_presentation.pptx"
    PPTXReporter.generate(dummy_data, filename)

    return FileResponse(filename, media_type='application/vnd.openxmlformats-officedocument.presentationml.presentation', filename=f"{company_name}_presentation.pptx")
801
+
802
class EngineUpdate(BaseModel):
    """Admin request body for switching a user's analysis engine."""
    engine: str  # must be "v1" or "v2" (validated in the endpoint)
804
+
805
@router.put("/admin/users/{user_id}/engine")
def update_user_engine(
    user_id: int,
    update: EngineUpdate,
    admin: User = Depends(get_current_admin),
    db: Session = Depends(get_db)
):
    """Admin-only: switch which analysis engine a user's uploads run on."""
    target = db.query(User).filter(User.id == user_id).first()
    if target is None:
        raise HTTPException(status_code=404, detail="User not found")

    if update.engine not in ("v1", "v2"):
        raise HTTPException(status_code=400, detail="Invalid engine. Use 'v1' or 'v2'.")

    target.preferred_engine = update.engine
    db.commit()
    return {"status": "success", "engine": target.preferred_engine}
822
+
823
@router.get("/public-config")
def get_public_config(db: Session = Depends(get_db)):
    """Get configuration for Guest/Public users (no auth required)."""
    from app.services.feature_service import get_effective_features
    return {
        "guest_features": get_effective_features(db, "Guest"),
        # NOTE(review): hard-coded guest upload limit — presumably this should
        # come from plan_config; confirm before relying on it elsewhere.
        "upload_limit": 2
    }
app/api/visilok.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends
2
+ from app.api.auth import get_current_user
3
+ from app.models.user import User
4
+ import os
5
+
6
+ router = APIRouter(prefix="/visilok", tags=["security"])
7
+
8
def get_current_admin(current_user: User = Depends(get_current_user)):
    """FastAPI dependency: pass the user through only if they are an admin."""
    if current_user.is_admin:
        return current_user
    from fastapi import HTTPException
    raise HTTPException(status_code=403, detail="Admin privileges required for Visi-Lok monitor.")
13
+
14
@router.get("/status")
def get_security_status(admin: User = Depends(get_current_admin)):
    """
    Visi-Lok: Security & Auth Engine Monitor
    Reports on the current encryption and security thresholds of the system.
    """
    # 1. Database encryption — managed PostgreSQL on Render encrypts at rest;
    #    a local dev database does not. Detect cloud vs local via env vars.
    is_production = os.getenv("RENDER") == "true" or os.getenv("NODE_ENV") == "production"
    db_status = {
        "encrypted_at_rest": is_production,
        "algorithm": "AES-256" if is_production else "None (Local)",
        "provider": "Render/AWS KMS" if is_production else "Local Disk",
    }

    # 2. Object storage — Cloudflare R2 applies AES-256 at rest by default.
    r2_configured = bool(os.getenv("R2_ACCOUNT_ID"))
    storage_status = {
        "encrypted_at_rest": r2_configured,
        "algorithm": "AES-256" if r2_configured else "None",
        "provider": "Cloudflare R2" if r2_configured else "Local Storage",
    }

    # 3. Transport layer — HTTPS termination is handled by the hosting
    #    load balancers (Render/Vercel), so it is reported as enforced.
    tls_status = {
        "enforced": True,  # Usually handled by Render/Vercel load balancers
        "protocol": "TLS 1.3/1.2",
    }

    # 4. Auth & row-level security — enforced via Depends(get_current_user)
    #    plus per-user filtering at the query level.
    auth_status = {
        "row_level_security": "Active",
        "jwt_encryption": True,
        "password_hashing": "Argon2",
        "mfa_enabled": False,  # Future roadmap
    }

    overall_secure = db_status["encrypted_at_rest"] and storage_status["encrypted_at_rest"]
    verdict = "SECURE" if overall_secure else "WARNING: UNENCRYPTED COMPONENTS DETECTED"

    return {
        "system": "Visi-Lok Security Monitor",
        "status": verdict,
        "checks": {
            "database": db_status,
            "object_storage": storage_status,
            "transport": tls_status,
            "authentication": auth_status,
        },
    }
app/core/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Core Configuration Package
3
+
4
+ This package contains application-wide configuration and utilities.
5
+
6
+ ## Modules
7
+
8
+ - `config.py` - Environment variables and settings
9
+ - `database.py` - SQLAlchemy engine and session
10
+ - `security.py` - JWT token creation/validation
11
+ - `feature_registry.py` - Centralized feature definitions (auto-discoverable)
12
+ - `plan_config.py` - Plan limits and default feature sets
13
+
14
+ ## Feature System Architecture
15
+
16
+ The feature system uses a layered approach:
17
+
18
+ 1. **Feature Registry** (`feature_registry.py`)
19
+ - Defines ALL controllable features
20
+ - Features auto-appear in admin console
21
+ - Organized by category for easy navigation
22
+
23
+ 2. **Plan Config** (`plan_config.py`)
24
+ - Default features per plan tier
25
+ - Upload limits per plan
26
+ - Wildcard "*" for unlimited access
27
+
28
+ 3. **Admin Overrides** (via `models/feature_flags.py`)
29
+ - Stored in database
30
+ - Takes precedence over defaults
31
+ - Managed via admin API
32
+
33
+ ## Adding New Features
34
+
35
+ ```python
36
+ # In feature_registry.py, add to FEATURE_REGISTRY:
37
+ Feature(
38
+ id="new_feature_id",
39
+ name="New Feature Name",
40
+ description="What this feature does",
41
+ category=FeatureCategory.CORE_METRICS # Pick appropriate category
42
+ )
43
+ ```
44
+
45
+ The feature will automatically:
46
+ - Appear in admin console UI
47
+ - Be toggleable per plan
48
+ - Respect plan defaults until overridden
49
+ """
50
+
51
+ from app.core.feature_registry import FEATURE_REGISTRY, Feature, FeatureCategory
52
+ from app.core.plan_config import PLAN_DEFAULTS, get_plan_config, get_default_features
app/core/config.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings, SettingsConfigDict
2
+ from pydantic import field_validator
3
+ from typing import List, Union, Optional
4
+
5
class Settings(BaseSettings):
    """Application settings loaded from the environment (and .env).

    SECRET_KEY and DATABASE_URL are required; everything else has a
    default or is optional.
    """

    # Application Config
    PROJECT_NAME: str = "Visique API"
    VERSION: str = "0.1.0"
    API_V1_STR: str = "/api/v1"

    # Security
    SECRET_KEY: str  # Required in production
    ALGORITHM: str = "HS256"
    ACCESS_TOKEN_EXPIRE_MINUTES: int = 1440  # 24 hours for better UX

    # Database
    DATABASE_URL: str  # PostgreSQL URL

    # CORS
    ALLOWED_ORIGINS: Union[List[str], str] = [
        "http://localhost:3000",
        "http://127.0.0.1:3000",
        "https://visique-testing.vercel.app",
        "https://visique-frontend.vercel.app"
    ]

    @field_validator("ALLOWED_ORIGINS", mode="before")
    @classmethod
    def assemble_cors_origins(cls, v: Union[str, List[str]]) -> Union[List[str], str]:
        """Accept a comma-separated string, a JSON list literal, or a list."""
        if isinstance(v, str) and not v.startswith("["):
            return [i.strip() for i in v.split(",")]
        elif isinstance(v, str) and v.startswith("["):
            import json
            return json.loads(v)
        elif isinstance(v, list):
            return v
        raise ValueError(v)

    # Stripe
    STRIPE_SECRET_KEY: Optional[str] = None
    STRIPE_PUBLISHABLE_KEY: Optional[str] = None
    STRIPE_WEBHOOK_SECRET: Optional[str] = None

    # Deployment
    ENVIRONMENT: str = "development"

    # Cloudflare R2 Storage
    # BUGFIX: these four fields were declared twice in the class (a second,
    # identical block followed the Dolphin settings); the duplicate is removed.
    R2_ACCOUNT_ID: Optional[str] = None
    R2_ACCESS_KEY_ID: Optional[str] = None
    R2_SECRET_ACCESS_KEY: Optional[str] = None
    R2_BUCKET_NAME: Optional[str] = None

    # Dolphin PDF Extraction
    DOLPHIN_MODEL_PATH: Optional[str] = None  # Auto-downloads if None
    DOLPHIN_DEVICE: str = "auto"  # "auto" (CUDA > MPS > CPU) | "cuda" | "mps" | "cpu"
    DOLPHIN_MAX_BATCH_SIZE: int = 4
    DOLPHIN_AUTO_DOWNLOAD: bool = True

    # Dolphin Remote Service (Optional - for distributed setup)
    DOLPHIN_API_URL: Optional[str] = None
    DOLPHIN_API_KEY: Optional[str] = None

    model_config = SettingsConfigDict(env_file=".env", case_sensitive=True, extra="ignore")

settings = Settings()
app/core/database.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import create_engine
2
+ from sqlalchemy.ext.declarative import declarative_base
3
+ from sqlalchemy.orm import sessionmaker
4
+
5
+ from app.core.config import settings
6
+
7
SQLALCHEMY_DATABASE_URL = settings.DATABASE_URL

# Fix for Render/SQLAlchemy postgres:// scheme
# (modern SQLAlchemy rejects the legacy "postgres://" prefix some hosts emit)
if SQLALCHEMY_DATABASE_URL.startswith("postgres://"):
    SQLALCHEMY_DATABASE_URL = SQLALCHEMY_DATABASE_URL.replace("postgres://", "postgresql://", 1)

# PostgreSQL-specific connect_args (keepalives break SQLite)
_is_sqlite = SQLALCHEMY_DATABASE_URL.startswith("sqlite")

# Engine options shared by both backends.
_engine_kwargs: dict = dict(
    pool_pre_ping=True,
)

if not _is_sqlite:
    # Pooling + TCP keepalive tuning for a hosted PostgreSQL instance.
    _engine_kwargs.update(
        pool_recycle=280,
        pool_size=5,
        max_overflow=10,
        connect_args={
            "keepalives": 1,
            "keepalives_idle": 30,
            "keepalives_interval": 10,
            "keepalives_count": 5,
            "connect_timeout": 10,
        },
    )
else:
    # SQLite objects default to single-thread use; server handlers may
    # touch the connection from worker threads.
    _engine_kwargs["connect_args"] = {"check_same_thread": False}

engine = create_engine(SQLALCHEMY_DATABASE_URL, **_engine_kwargs)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

Base = declarative_base()
40
+
41
def get_db():
    """Dependency generator: yield a DB session, always closing it after use."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
app/core/feature_registry.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feature Registry - Auto-Discoverable Feature System
3
+
4
+ Add new features here and they will automatically appear in the admin console.
5
+ Each feature belongs to a category and can be toggled per plan.
6
+ """
7
+
8
+ from enum import Enum
9
+ from dataclasses import dataclass, field
10
+ from typing import List, Dict, Optional
11
+
12
+
13
class FeatureCategory(Enum):
    """Categories for organizing features in admin console.

    The enum *value* is the display label used for grouping in the UI.
    """
    CORE_METRICS = "Core Metrics"
    RISK_ANALYSIS = "Risk Analysis"
    FORECASTING = "Forecasting"
    AI_INTELLIGENCE = "AI Intelligence"
    INTERACTIVE = "Interactive Tools"
    EXPORTS = "Exports & Reports"
+
22
+
23
@dataclass
class Feature:
    """
    Represents a controllable feature in the system.

    Attributes:
        id: Unique identifier used in code and API
        name: Human-readable name for admin console
        category: Grouping category
        description: Brief description of the feature
        default_enabled: Whether enabled by default for new plans
        memory_cost_mb: Estimated RAM usage in MB when the feature runs
    """
    id: str
    name: str
    category: FeatureCategory
    description: str
    default_enabled: bool = True
    memory_cost_mb: int = 5  # Estimated RAM usage in MB
+
42
+
43
+
44
+ # =============================================================================
45
+ # FEATURE REGISTRY - ADD NEW FEATURES HERE
46
+ # =============================================================================
47
+ # When adding new financial model outputs, add a Feature entry below.
48
+ # It will automatically appear in the admin console under the correct category.
49
+ # =============================================================================
50
+
51
FEATURE_REGISTRY: List[Feature] = [
    # -------------------------------------------------------------------------
    # Core Metrics
    # -------------------------------------------------------------------------
    # Entries omitting memory_cost_mb use the Feature default (5 MB).
    Feature(
        id="kpi_margins",
        name="Profit Margins (Gross/Operating/Net)",
        category=FeatureCategory.CORE_METRICS,
        description="Core margin KPIs from income statement",
        memory_cost_mb=2
    ),
    Feature(
        id="kpi_ratios",
        name="Financial Ratios",
        category=FeatureCategory.CORE_METRICS,
        description="Current ratio, debt-to-equity, quick ratio",
        memory_cost_mb=2
    ),
    Feature(
        id="health_score",
        name="Health Score Dashboard",
        category=FeatureCategory.CORE_METRICS,
        description="Overall financial health scoring (stability, profitability, growth, efficiency)"
    ),

    # -------------------------------------------------------------------------
    # Risk Analysis
    # -------------------------------------------------------------------------
    Feature(
        id="risk_score",
        name="Risk Score",
        category=FeatureCategory.RISK_ANALYSIS,
        description="Aggregate risk scoring (0-100)",
        memory_cost_mb=5
    ),
    Feature(
        id="risk_factors",
        name="Risk Factor Breakdown",
        category=FeatureCategory.RISK_ANALYSIS,
        description="Detailed list of identified risk factors"
    ),
    Feature(
        id="liquidity_risk",
        name="Liquidity Risk",
        category=FeatureCategory.RISK_ANALYSIS,
        description="Cash flow and working capital risk assessment"
    ),
    Feature(
        id="solvency_risk",
        name="Solvency Risk",
        category=FeatureCategory.RISK_ANALYSIS,
        description="Long-term debt sustainability analysis"
    ),

    # -------------------------------------------------------------------------
    # Forecasting
    # -------------------------------------------------------------------------
    Feature(
        id="runway_forecast",
        name="Cash Runway Forecast",
        category=FeatureCategory.FORECASTING,
        description="30/60/90 day cash projections"
    ),
    Feature(
        id="burn_rate",
        name="Burn Rate Analysis",
        category=FeatureCategory.FORECASTING,
        description="Monthly cash burn rate calculation"
    ),
    Feature(
        id="optimization_insights",
        name="Optimization Insights",
        category=FeatureCategory.FORECASTING,
        description="Dead zones, peak premiums, cost optimization"
    ),
    Feature(
        id="budget_variance",
        name="Budget Variance Analysis",
        category=FeatureCategory.FORECASTING,
        description="Target vs actual comparison"
    ),

    # -------------------------------------------------------------------------
    # AI Intelligence — the heaviest features by estimated memory cost
    # -------------------------------------------------------------------------
    Feature(
        id="ai_cfo",
        name="AI CFO Chat",
        category=FeatureCategory.AI_INTELLIGENCE,
        description="Conversational AI financial advisor",
        memory_cost_mb=80
    ),
    Feature(
        id="ai_summary",
        name="AI Executive Summary",
        category=FeatureCategory.AI_INTELLIGENCE,
        description="Auto-generated narrative insights",
        memory_cost_mb=60
    ),
    Feature(
        id="geo_insights",
        name="Geo-Strategic Insights",
        category=FeatureCategory.AI_INTELLIGENCE,
        description="Location-based market analysis",
        memory_cost_mb=150
    ),
    Feature(
        id="intelligence_card",
        name="Strategic Intelligence Card",
        category=FeatureCategory.AI_INTELLIGENCE,
        description="AI-powered strategic recommendations",
        memory_cost_mb=50
    ),

    # -------------------------------------------------------------------------
    # Interactive Tools
    # -------------------------------------------------------------------------
    Feature(
        id="what_if_slider",
        name="What-If Simulator",
        category=FeatureCategory.INTERACTIVE,
        description="Revenue/cost scenario modeling with sliders"
    ),
    Feature(
        id="interactive_charts",
        name="Interactive Charts",
        category=FeatureCategory.INTERACTIVE,
        description="Zoomable, hoverable data visualizations"
    ),
    Feature(
        id="trend_comparison",
        name="Trend Comparison",
        category=FeatureCategory.INTERACTIVE,
        description="Period-over-period analysis"
    ),

    # -------------------------------------------------------------------------
    # Exports & Reports
    # -------------------------------------------------------------------------
    Feature(
        id="pdf_export",
        name="PDF Report Export",
        category=FeatureCategory.EXPORTS,
        description="Downloadable PDF financial report"
    ),
    Feature(
        id="pptx_export",
        name="PowerPoint Export",
        category=FeatureCategory.EXPORTS,
        description="Presentation-ready slides"
    ),
    Feature(
        id="csv_export",
        name="Data Export (CSV)",
        category=FeatureCategory.EXPORTS,
        description="Raw data download for further analysis"
    ),

    # -------------------------------------------------------------------------
    # Data Integrity (grouped under Core Metrics in the admin console)
    # -------------------------------------------------------------------------
    Feature(
        id="visi_veritas",
        name="Visi-Veritas Audit",
        category=FeatureCategory.CORE_METRICS,
        description="32-rule financial data validation engine with confidence scoring",
        memory_cost_mb=2
    ),
]
220
+
221
+
222
+ # =============================================================================
223
+ # HELPER FUNCTIONS
224
+ # =============================================================================
225
+
226
def get_all_features() -> List[Feature]:
    """Returns all registered features (the live registry list, not a copy)."""
    return FEATURE_REGISTRY
229
+
230
+
231
def get_feature_by_id(feature_id: str) -> Optional[Feature]:
    """Look up a registered feature; None when the ID is unknown."""
    return next((f for f in FEATURE_REGISTRY if f.id == feature_id), None)
237
+
238
+
239
def get_all_feature_ids() -> List[str]:
    """Returns list of all feature IDs, in registry order."""
    return [feature.id for feature in FEATURE_REGISTRY]
242
+
243
+
244
def get_features_by_category() -> Dict[str, List[Feature]]:
    """Group features by category display name; empty categories are omitted."""
    grouped: Dict[str, List[Feature]] = {}
    for category in FeatureCategory:
        members = [f for f in FEATURE_REGISTRY if f.category is category]
        if members:
            grouped[category.value] = members
    return grouped
252
+
253
+
254
def get_default_enabled_features() -> List[str]:
    """IDs of every feature whose default_enabled flag is set."""
    return [feature.id for feature in FEATURE_REGISTRY if feature.default_enabled]
257
+
258
+
259
def validate_feature_ids(feature_ids: List[str]) -> List[str]:
    """
    Validates a list of feature IDs against the registry.
    Returns the unknown IDs (empty list when everything is valid).
    """
    known = set(get_all_feature_ids())
    return [fid for fid in feature_ids if fid not in known]
app/core/migrations.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Automatic Schema Migration Utility

This module runs at startup to ensure database columns match the SQLAlchemy models.
It adds any missing columns automatically, preventing 'UndefinedColumn' errors
in production when new fields are added to models.

Limitations: columns are only ever ADDED — renames, drops, and type changes
still require a real migration tool.
"""

from sqlalchemy import inspect, text
from sqlalchemy.engine import Engine
import logging

# Module-level logger; configured by the application, not here.
logger = logging.getLogger(__name__)
14
+
15
+
16
def get_model_columns(model_class):
    """Extract column definitions from a SQLAlchemy model class.

    Walks the class attributes and collects anything exposing the
    ``.property.columns`` chain of an instrumented column attribute,
    returning a mapping of column name -> Column object.

    Fix: removed the unused local ``from sqlalchemy import Column`` import.
    """
    columns = {}
    for attr_name in dir(model_class):
        # Default of None guards against descriptors that raise on access.
        attr = getattr(model_class, attr_name, None)
        if hasattr(attr, 'property') and hasattr(attr.property, 'columns'):
            col = attr.property.columns[0]
            columns[col.name] = col
    return columns
26
+
27
+
28
def get_db_columns(engine: Engine, table_name: str):
    """Get existing column names from the database table.

    Returns an empty set when the table cannot be inspected (e.g. it does
    not exist yet) — callers treat an empty set as "table missing" and let
    ``create_all`` build it. The broad except is deliberate best-effort.
    """
    inspector = inspect(engine)
    try:
        return {col['name'] for col in inspector.get_columns(table_name)}
    except Exception:
        return set()
35
+
36
+
37
def get_column_type_sql(column):
    """Map a SQLAlchemy column type onto a SQL type string (PostgreSQL flavour)."""
    from sqlalchemy import Boolean, Integer, String, DateTime, Text, Float, JSON

    # Length-bounded string columns keep their declared size.
    declared_length = getattr(column.type, 'length', None)
    if declared_length:
        return f"VARCHAR({declared_length})"

    sql_by_type = {
        Boolean: "BOOLEAN",
        Integer: "INTEGER",
        String: "VARCHAR(255)",
        DateTime: "TIMESTAMP",
        Text: "TEXT",
        Float: "FLOAT",
        JSON: "JSONB",  # PostgreSQL JSON type
    }
    # Exact type match, as before; unknown types fall back to TEXT.
    return sql_by_type.get(type(column.type), "TEXT")
58
+
59
+
60
def get_default_sql(column):
    """Build the SQL DEFAULT clause for a column, or '' when none applies.

    Handles scalar Python defaults (bool/int/float/str/dict). Callable
    defaults (e.g. ``datetime.utcnow``) cannot be expressed as a static
    SQL DEFAULT, so they yield '' just as before — they keep applying at
    the ORM level only.
    """
    if column.default is None:
        return ""
    default_val = column.default.arg
    if callable(default_val):
        # Explicit skip (previously fell through all isinstance checks).
        return ""
    # bool must be tested before int: isinstance(True, int) is also True.
    if isinstance(default_val, bool):
        return "DEFAULT TRUE" if default_val else "DEFAULT FALSE"
    if isinstance(default_val, (int, float)):
        return f"DEFAULT {default_val}"
    if isinstance(default_val, str):
        # Fix: escape embedded single quotes so the literal stays valid SQL.
        escaped = default_val.replace("'", "''")
        return f"DEFAULT '{escaped}'"
    if isinstance(default_val, dict):
        return "DEFAULT '{}'"
    return ""
73
+
74
+
75
def run_migrations(engine: Engine):
    """
    Check all models and add any missing columns to the database.
    This runs at application startup.

    For each registered model: diff the model's declared columns against
    the live table, then issue ``ALTER TABLE ... ADD COLUMN`` for anything
    missing. Identifiers are interpolated into the SQL string directly;
    they come from our own model definitions, never from user input.
    """
    # Imported here (not at module top) to avoid circular imports at startup.
    from app.models.user import User, Analysis, Payment
    from app.models.feature_flags import PlanFeatureOverride, PlanUploadLimit

    models = [User, Analysis, Payment, PlanFeatureOverride, PlanUploadLimit]

    for model in models:
        table_name = model.__tablename__
        model_cols = get_model_columns(model)
        db_cols = get_db_columns(engine, table_name)

        if not db_cols:
            # Table doesn't exist yet, let create_all handle it
            logger.info(f"Table '{table_name}' not found, will be created by create_all()")
            continue

        missing_cols = set(model_cols.keys()) - db_cols

        for col_name in missing_cols:
            col = model_cols[col_name]
            col_type = get_column_type_sql(col)
            default_clause = get_default_sql(col)

            sql = f'ALTER TABLE {table_name} ADD COLUMN {col_name} {col_type} {default_clause}'

            try:
                # Fresh connection per ALTER so one failure cannot poison the rest.
                with engine.connect() as conn:
                    conn.execute(text(sql))
                    conn.commit()
                logger.info(f"✓ Added column '{col_name}' to table '{table_name}'")
            except Exception as e:
                # Column might already exist or other issue
                logger.warning(f"Could not add column '{col_name}' to '{table_name}': {e}")
app/core/plan_config.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Plan Configuration - Default settings for each subscription plan.
3
+
4
+ This module defines upload limits and default feature access per plan.
5
+ Admins can override these defaults via the admin console.
6
+ """
7
+
8
+ from typing import Dict, List, Any
9
+ from .feature_registry import get_all_feature_ids
10
+
11
+
12
+ # =============================================================================
13
+ # PLAN CONFIGURATION
14
+ # =============================================================================
15
+ # Each plan has:
16
+ # - upload_limit: Monthly upload cap
17
+ # - is_session: True for guest/anonymous (session-based tracking)
18
+ # - features: List of enabled feature IDs, or ["*"] for all features
19
+ # =============================================================================
20
+
21
# NOTE(review): models.user.User.plan defaults to "Free", which is NOT a key
# here — such users resolve to the "Individual" fallback in get_plan_config().
# Confirm that is intended.
PLAN_DEFAULTS: Dict[str, Dict[str, Any]] = {
    # Guest users on /try page (session-based, no account)
    "Guest": {
        "upload_limit": 2,
        "is_session": True,
        "features": [
            "kpi_margins",
            "health_score",
            "risk_score",
            "pdf_export"
        ]
    },

    # Free trial - full Small Business experience for 1 month
    "Free Trial": {
        "upload_limit": 15,
        "is_session": False,
        "features": [
            "kpi_margins",
            "kpi_ratios",
            "health_score",
            "risk_score",
            "risk_factors",
            "runway_forecast",
            "burn_rate",
            "interactive_charts",
            "pdf_export"
        ]
    },

    # Individual plan - $9/month
    "Individual": {
        "upload_limit": 5,
        "is_session": False,
        "features": [
            "kpi_margins",
            "kpi_ratios",
            "health_score",
            "risk_score",
            "risk_factors",
            "pdf_export"
        ]
    },

    # Organization plan - $49/month
    "Organization": {
        "upload_limit": 10,
        "is_session": False,
        "features": [
            "kpi_margins",
            "kpi_ratios",
            "health_score",
            "risk_score",
            "risk_factors",
            "liquidity_risk",
            "runway_forecast",
            "ai_summary",
            "interactive_charts",
            "pdf_export"
        ]
    },

    # Small Business plan - $99/month
    "Small Business": {
        "upload_limit": 15,
        "is_session": False,
        "features": ["*"]  # All features
    },

    # Mid Business plan - $249/month
    "Mid Business": {
        "upload_limit": 25,
        "is_session": False,
        "features": ["*"]  # All features
    },

    # Large Business / Enterprise - $499+/month
    "Large Business": {
        "upload_limit": 50,
        "is_session": False,
        "features": ["*"]  # All features
    },

    # Admin users - unlimited access
    "Admin": {
        "upload_limit": 999999,
        "is_session": False,
        "features": ["*"]
    },

    # Engine Configs (Treated as Plans for feature flags, not billable tiers)
    "_ENGINE_v1": {
        "upload_limit": 0,
        "is_session": False,
        "features": ["*"]
    },
    "_ENGINE_v2": {
        "upload_limit": 0,
        "is_session": False,
        "features": [
            "kpi_margins", "kpi_ratios", "health_score", "risk_score", "risk_factors",
            "runway_forecast", "burn_rate", "interactive_charts", "pdf_export",
            "ai_summary", "intelligence_card"
            # Note: Geo Insights and AI CFO omitted by default for Lite Engine
        ]
    }
}



# Special "Plan" names for Engine Feature Configuration
ENGINE_PLANS = ["_ENGINE_v1", "_ENGINE_v2"]

# Mappings for UI display
ENGINE_DISPLAY_NAMES = {
    "_ENGINE_v1": "Visi-Insight-1 (Standard)",
    "_ENGINE_v2": "Visi-Insight-2 (Lite)"
}
139
+
140
+
141
+ # =============================================================================
142
+ # HELPER FUNCTIONS
143
+ # =============================================================================
144
+
145
def get_plan_config(plan_name: str) -> Dict[str, Any]:
    """
    Get configuration for a specific plan.
    Falls back to Individual if plan not found (including the "Free"
    default stored on new users).
    """
    return PLAN_DEFAULTS.get(plan_name, PLAN_DEFAULTS["Individual"])
151
+
152
+
153
def get_upload_limit(plan_name: str) -> int:
    """Monthly upload cap for the given plan (5 when the key is absent)."""
    return get_plan_config(plan_name).get("upload_limit", 5)
157
+
158
+
159
def get_default_features(plan_name: str) -> List[str]:
    """
    Resolve the enabled feature IDs for a plan.

    The wildcard entry "*" expands to every registered feature ID.
    """
    declared = get_plan_config(plan_name).get("features", [])
    return get_all_feature_ids() if "*" in declared else declared
171
+
172
+
173
def is_session_based(plan_name: str) -> bool:
    """Check if plan uses session-based tracking (for guests, no account)."""
    config = get_plan_config(plan_name)
    # Unknown plans fall back to account-based tracking.
    return config.get("is_session", False)
177
+
178
+
179
def get_all_plans() -> List[str]:
    """Returns all plan names, including Guest, Admin and _ENGINE_* pseudo-plans."""
    return list(PLAN_DEFAULTS.keys())
182
+
183
+
184
def get_billable_plans() -> List[str]:
    """Returns plans that are actual subscription tiers (excludes Guest/Admin).

    Fix: also excludes the internal "_ENGINE_*" pseudo-plans, which exist
    only to carry engine feature flags (see ENGINE_PLANS) and are not tiers
    a customer can subscribe to — previously they leaked into this list.
    """
    non_billable = {"Guest", "Admin"}
    return [
        p for p in PLAN_DEFAULTS
        if p not in non_billable and not p.startswith("_ENGINE_")
    ]
187
+
188
+
189
def get_all_engines() -> List[str]:
    """Returns list of engine identifier keys.

    NOTE(review): returns the module-level list itself, not a copy —
    callers must not mutate it.
    """
    return ENGINE_PLANS
192
+
app/core/security.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime, timedelta
2
+ from typing import Optional
3
+ from jose import JWTError, jwt
4
+ from passlib.context import CryptContext
5
+ from app.core.config import settings
6
+
7
# Config — all values sourced from app settings (environment-driven).
SECRET_KEY = settings.SECRET_KEY
ALGORITHM = settings.ALGORITHM
ACCESS_TOKEN_EXPIRE_MINUTES = settings.ACCESS_TOKEN_EXPIRE_MINUTES

# Single shared password-hashing context; argon2 is the only active scheme.
pwd_context = CryptContext(schemes=["argon2"], deprecated="auto")
13
+
14
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """True when the plaintext matches the stored argon2 hash."""
    return pwd_context.verify(plain_password, hashed_password)
16
+
17
def get_password_hash(password: str) -> str:
    """Hash a plaintext password with argon2 for storage."""
    return pwd_context.hash(password)
19
+
20
def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str:
    """
    Create a signed JWT containing ``data`` plus an ``exp`` claim.

    Args:
        data: Claims to embed (copied, the caller's dict is not mutated).
        expires_delta: Optional lifetime; defaults to
            ACCESS_TOKEN_EXPIRE_MINUTES from settings.

    Returns:
        The encoded JWT string.
    """
    # Local import: this module only imports datetime/timedelta at the top.
    from datetime import timezone

    to_encode = data.copy()
    if expires_delta is None:
        # Fix: explicit None check — previously `if expires_delta:` silently
        # replaced a caller-supplied timedelta(0) with the default lifetime.
        expires_delta = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
    # Fix: datetime.utcnow() is deprecated and naive; use aware UTC time.
    expire = datetime.now(timezone.utc) + expires_delta
    to_encode.update({"exp": expire})
    return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
app/core/stripe_config.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import stripe
from app.core.config import settings

# Configure the global Stripe client at import time with our secret key.
stripe.api_key = settings.STRIPE_SECRET_KEY
5
+
6
def create_checkout_session(db_user, plan_id: str):
    """
    Create a Stripe Checkout session for a subscription purchase.

    Args:
        db_user: ORM user; must expose ``.email`` and ``.id``.
        plan_id: Stripe Price ID used as the subscription line item.

    Returns:
        The Stripe session object, or None when creation fails — callers
        are expected to treat None as "checkout unavailable" (best-effort).
    """
    import logging
    logger = logging.getLogger(__name__)
    try:
        checkout_session = stripe.checkout.Session.create(
            customer_email=db_user.email,
            client_reference_id=str(db_user.id),
            payment_method_types=['card'],
            line_items=[
                {
                    'price': plan_id,
                    'quantity': 1,
                },
            ],
            mode='subscription',
            # The first allowed origin doubles as the frontend base URL.
            success_url=f"{settings.ALLOWED_ORIGINS[0]}/dashboard?session_id={{CHECKOUT_SESSION_ID}}",
            cancel_url=f"{settings.ALLOWED_ORIGINS[0]}/pricing",
            metadata={
                'user_id': db_user.id,
                # TODO(review): hardcoded label — derive plan_name from plan_id.
                'plan_name': 'Business'
            }
        )
        return checkout_session
    except Exception as e:
        # Fix: route failures through logging instead of bare print() so they
        # reach production log aggregation; behavior otherwise unchanged.
        logger.error("Stripe checkout session creation failed: %s", e)
        return None
app/main.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI(
    title="Visique API",
    description="Financial Data Analyzer Backend",
    version="0.1.2"  # Bump version to clear previous failure
)

from app.core.config import settings

# CORS Configuration
# Ensure Vercel domains are allowed even if env vars override config defaults.
# settings.ALLOWED_ORIGINS may be a list or a single value depending on how
# the env var was parsed, so normalize to a list first.
origins = []
if isinstance(settings.ALLOWED_ORIGINS, list):
    origins.extend(settings.ALLOWED_ORIGINS)
else:
    origins.append(str(settings.ALLOWED_ORIGINS))

extra_origins = [
    "https://visique-testing.vercel.app",
    "https://visique-frontend.vercel.app",
    # Specific current previews
    "https://visique-testing-7qdi0vaqf-sams-projects-85f65c65.vercel.app",
    "https://visique-testing-fky1isli2-sams-projects-85f65c65.vercel.app"
]

# De-duplicate while preserving order.
for origin in extra_origins:
    if origin not in origins:
        origins.append(origin)

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    # Allow any Vercel preview domain for this specific project
    allow_origin_regex=r"https://visique-testing-.*-sams-projects-85f65c65\.vercel\.app",
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
41
+
42
@app.get("/")
async def root():
    """Landing endpoint; useful as a quick smoke test of the deployment."""
    return {"message": "Welcome to Visique Financial Analyzer API"}
45
+
46
@app.get("/health")
async def health_check():
    """Liveness probe; also the target of the keep-alive self-ping task."""
    return {"status": "healthy"}
49
+
50
from app.api.endpoints import router as analysis_router
from app.api.auth import router as auth_router
from app.core.database import engine, Base

# Run Automatic Schema Migrations (adds missing columns)
# NOTE(review): a startup handler later in this file re-defines the name
# `run_migrations`, shadowing this import — confirm and rename one of them.
from app.core.migrations import run_migrations
run_migrations(engine)

# Create Tables (for new tables only, migrations handles columns)
Base.metadata.create_all(bind=engine)

app.include_router(analysis_router, prefix="/api/v1")
app.include_router(auth_router, prefix="/api/v1")

from app.api.admin import router as admin_router
app.include_router(admin_router, prefix="/api/v1")

from app.api.visilok import router as visilok_router
app.include_router(visilok_router, prefix="/api/v1")

# Mount Static Files for Uploads
from fastapi.staticfiles import StaticFiles
import os

# Ensure upload directory exists
upload_dir = "uploads"
if not os.path.exists(upload_dir):
    os.makedirs(upload_dir)

# Mount /api/v1/static to the uploads directory
app.mount("/api/v1/static", StaticFiles(directory="uploads"), name="static")
81
+
82
from sqlalchemy import text
from app.core.database import SessionLocal

# Startup Migration for V2 Engine Support
@app.on_event("startup")
def ensure_preferred_engine_column():
    """
    Add users.preferred_engine if it doesn't exist yet (idempotent).

    Fixes:
    - Renamed from ``run_migrations``: the old name shadowed
      ``app.core.migrations.run_migrations`` imported earlier in this module.
    - The session is now closed in ``finally`` so a failing ALTER no longer
      leaks a connection (previously ``db.close()`` only ran on success).
    """
    db = None
    try:
        db = SessionLocal()
        # IF NOT EXISTS keeps this safe to run on every boot (PostgreSQL).
        db.execute(text("ALTER TABLE users ADD COLUMN IF NOT EXISTS preferred_engine VARCHAR DEFAULT 'v1'"))
        db.commit()
        print("Startup Migration: Verified preferred_engine column.")
    except Exception as e:
        print(f"Startup Migration Warning: {e}")
    finally:
        if db is not None:
            db.close()
97
+
98
+ # Keep-Alive Background Task to prevent Render free tier from sleeping
99
+ import asyncio
100
+ import httpx
101
+
102
async def keep_alive_task():
    """Pings the health endpoint every 5 minutes to prevent cold starts."""
    # Give the server a minute to finish booting before the first ping.
    await asyncio.sleep(60)

    # Resolve our own public URL (Render injects RENDER_EXTERNAL_URL).
    base_url = os.environ.get("RENDER_EXTERNAL_URL", "https://visique-backend.onrender.com")
    target = f"{base_url}/health"

    print(f"[Keep-Alive] Started. Pinging {target} every 5 minutes.")

    # One client reused for the lifetime of the task.
    async with httpx.AsyncClient() as client:
        while True:
            try:
                resp = await client.get(target, timeout=30)
                print(f"[Keep-Alive] Ping successful: {resp.status_code}")
            except Exception as exc:
                print(f"[Keep-Alive] Ping failed: {exc}")
            # Wait 5 minutes (300 seconds) before the next ping.
            await asyncio.sleep(300)
123
+
124
@app.on_event("startup")
async def start_keep_alive():
    """Starts the keep-alive background task on app startup."""
    # Fire-and-forget: the task loops forever; no handle is retained.
    asyncio.create_task(keep_alive_task())
app/models/feature_flags.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feature Flags Model - Admin-managed feature overrides per plan.
3
+
4
+ This model stores per-plan feature overrides that take precedence
5
+ over the defaults defined in plan_config.py.
6
+ """
7
+
8
+ from sqlalchemy import Column, Integer, String, Boolean, DateTime, ForeignKey
9
+ from sqlalchemy.orm import relationship
10
+ from datetime import datetime
11
+ from app.core.database import Base
12
+
13
+
14
class PlanFeatureOverride(Base):
    """
    Stores admin overrides for feature availability per plan.

    When checking if a feature is enabled for a plan:
    1. Check if override exists in this table
    2. If yes, use the override value
    3. If no, fall back to plan_config.py defaults

    NOTE(review): no unique constraint on (plan_name, feature_id) —
    duplicate override rows are possible; confirm lookup code handles that.
    """
    __tablename__ = "plan_feature_overrides"

    id = Column(Integer, primary_key=True, index=True)
    plan_name = Column(String, index=True, nullable=False)
    feature_id = Column(String, index=True, nullable=False)
    enabled = Column(Boolean, default=True, nullable=False)

    # Audit fields
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    updated_by_id = Column(Integer, ForeignKey("users.id"), nullable=True)

    def __repr__(self):
        status = "enabled" if self.enabled else "disabled"
        return f"<PlanFeatureOverride {self.plan_name}:{self.feature_id}={status}>"
37
+
38
+
39
class PlanUploadLimit(Base):
    """
    Stores admin overrides for upload limits per plan.

    When checking upload limit for a plan:
    1. Check if override exists in this table
    2. If yes, use the override value
    3. If no, fall back to plan_config.py defaults
    """
    __tablename__ = "plan_upload_limits"

    id = Column(Integer, primary_key=True, index=True)
    # Unique: at most one limit override per plan.
    plan_name = Column(String, unique=True, index=True, nullable=False)
    upload_limit = Column(Integer, nullable=False)

    # Audit fields
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    updated_by_id = Column(Integer, ForeignKey("users.id"), nullable=True)

    def __repr__(self):
        return f"<PlanUploadLimit {self.plan_name}={self.upload_limit}>"
app/models/user.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlalchemy
2
+ from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, Text, Boolean
3
+ from sqlalchemy.orm import relationship
4
+ from datetime import datetime
5
+ from app.core.database import Base
6
+
7
class User(Base):
    """Registered account, including plan/billing state and feature add-ons."""
    __tablename__ = "users"

    id = Column(Integer, primary_key=True, index=True)
    email = Column(String, unique=True, index=True)
    hashed_password = Column(String)
    full_name = Column(String, nullable=True)
    company_name = Column(String, nullable=True)
    # NOTE(review): "Free" is not a key in plan_config.PLAN_DEFAULTS, so it
    # resolves via the Individual fallback — confirm that is intended.
    plan = Column(String, default="Free")
    plan_expires_at = Column(DateTime, nullable=True)
    is_admin = Column(Boolean, default=False)
    is_super_admin = Column(Boolean, default=False)
    created_at = Column(DateTime, default=datetime.utcnow)

    # New Fields for Verification & Profile
    visique_id = Column(String, unique=True, index=True, nullable=True)  # Generated VSQ-XXXX
    ein = Column(String, nullable=True)
    address = Column(String, nullable=True)
    profile_picture_url = Column(String, nullable=True)
    industry = Column(String, default="General")
    preferred_engine = Column(String, default="v1")  # "v1" (Standard) or "v2" (Lite)

    # Upload Tracking
    monthly_upload_count = Column(Integer, default=0)
    upload_reset_date = Column(DateTime, default=datetime.utcnow)

    # Custom User-Level Feature Overrides (Add-ons)
    # Fix: use the `dict` factory instead of a shared `{}` literal so every
    # new row gets its own default mapping — a single shared literal could be
    # mutated in place and leak state across instances.
    custom_features = Column(sqlalchemy.JSON, default=dict)  # Stores { feature_id: bool }

    analyses = relationship("Analysis", back_populates="owner", cascade="all, delete-orphan")
    payments = relationship("Payment", back_populates="user", cascade="all, delete-orphan")
38
+
39
class Analysis(Base):
    """One uploaded-document analysis run, with its serialized result."""
    __tablename__ = "analyses"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id"))
    timestamp = Column(DateTime, default=datetime.utcnow)
    company_name = Column(String)
    input_filename = Column(String)
    stored_filename = Column(String)  # Path to saved file on disk
    result_json = Column(Text)  # Full analysis payload serialized as JSON text

    owner = relationship("User", back_populates="analyses")
51
+
52
class Payment(Base):
    """Billing record for a user's plan purchase."""
    __tablename__ = "payments"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id"))
    # NOTE(review): schemas.user.PaymentBase declares `amount: float`, but this
    # column is Integer — confirm the intended unit (integer cents vs dollars)
    # and align the two declarations.
    amount = Column(Integer)
    status = Column(String)  # paid, pending, overdue
    date = Column(DateTime, default=datetime.utcnow)
    plan_name = Column(String)
    invoice_pdf = Column(String, nullable=True)  # Path to invoice file

    user = relationship("User", back_populates="payments")
app/schemas/chat.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import List, Optional
3
+
4
class Message(BaseModel):
    """A single chat turn."""
    role: str  # "user" or "assistant"
    content: str
7
+
8
class ChatRequest(BaseModel):
    """Chat request: the full running history plus an optional context scope."""
    messages: List[Message]
    context_filter: Optional[str] = None  # e.g. "Balance Sheet", "Risk Report"
11
+
12
class ChatResponse(BaseModel):
    """Assistant reply plus any supporting citations."""
    response: str
    sources: List[str] = []  # Citations or references to specific data points
app/schemas/financial.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
from pathlib import Path

# Dynamic Path Resolution for the 'financial_model' library.
# Expected layout: root/visique/backend/app/schemas/financial.py with the
# library at root/financial_model — so the repository root must be on
# sys.path for `import financial_model` to resolve.
# NOTE(review): parents[4] hard-codes that directory depth; confirm when the
# tree layout changes.
try:
    current_file = Path(__file__).resolve()
    # parents[0]=schemas, [1]=app, [2]=backend, [3]=visique, [4]=repo root
    project_root = current_file.parents[4]

    # Check if 'financial_model' exists in this root
    if (project_root / "financial_model").exists():
        if str(project_root) not in sys.path:
            sys.path.insert(0, str(project_root))
    else:
        # Fallback for different execution contexts (run from repo root or
        # from backend/ — probe the CWD and two levels above it).
        cwd = Path.cwd()
        if (cwd / "financial_model").exists():
            if str(cwd) not in sys.path: sys.path.insert(0, str(cwd))
        elif (cwd.parent.parent / "financial_model").exists():
            unique_root = str(cwd.parent.parent)
            if unique_root not in sys.path: sys.path.insert(0, unique_root)

except Exception as e:
    pass  # Best-effort: the import below will fail loudly if this didn't work.

try:
    # Now import from the PACKAGE "financial_model"
    from financial_model.models import (
        PeriodType, Currency,
        IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, OperatingMetrics,
        DocumentClassification,
        FinancialReport, KPIMetrics, BudgetModel, VarianceAnalysis, RiskAnalysis,
        HealthScoreBreakdown, GeoAnalysis, RunwayForecast, OptimizationInsight,
        StandardizedDataPackage, VisiVeritasReport
    )
except ImportError:
    print("WARNING: Could not import from financial_model library. Ensure project root is in PYTHONPATH.")
    raise
app/schemas/user.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, EmailStr
2
+ from typing import Optional, List
3
+ from datetime import datetime
4
+
5
class UserBase(BaseModel):
    """Shared base for all user payloads."""
    email: str  # was EmailStr — relaxed to plain str

class UserCreate(UserBase):
    """Registration payload."""
    password: str
    full_name: Optional[str] = None
    company_name: Optional[str] = None
    # Secret key that elevates the new account — NOTE(review): confirm it is
    # validated server-side and never echoed back.
    admin_key: Optional[str] = None

class UserLogin(UserBase):
    """Login payload (email + password)."""
    password: str
16
+
17
class UserResponse(UserBase):
    """Public user payload returned by the API (mirrors models.user.User)."""
    id: int
    full_name: Optional[str] = None
    company_name: Optional[str] = None
    plan: str = "Free"
    plan_expires_at: Optional[datetime] = None
    is_admin: bool = False
    is_super_admin: bool = False
    created_at: datetime

    # New Fields
    visique_id: Optional[str] = None
    ein: Optional[str] = None
    address: Optional[str] = None
    profile_picture_url: Optional[str] = None
    industry: Optional[str] = None
    preferred_engine: Optional[str] = "v1"
    custom_features: Optional[dict] = None  # JSON feature overrides

    class Config:
        # Allow construction directly from ORM objects.
        from_attributes = True
38
+
39
class Token(BaseModel):
    """Bearer-token response for successful authentication."""
    access_token: str
    token_type: str



class TokenData(BaseModel):
    """Claims decoded from a JWT (subject email)."""
    email: Optional[str] = None
47
+
48
class AnalysisBase(BaseModel):
    """Lightweight analysis listing item."""
    company_name: str
    input_filename: str
    timestamp: datetime
    # result_json is heavy; it is served by a separate detail view.

class AnalysisResponse(AnalysisBase):
    """Analysis listing item with identifiers."""
    id: int
    user_id: int

    class Config:
        from_attributes = True
60
+
61
class UpgradeRequest(BaseModel):
    """Plan-upgrade checkout payload.

    NOTE(review): raw card_number/expiry/cvv transit through this schema —
    confirm they are never persisted or logged (PCI scope), or move card
    collection to the payment provider entirely.
    """
    plan_name: str
    amount: float = 0.0
    card_number: str
    expiry: str
    cvv: str
    # New Checkout Fields
    address: Optional[str] = None
    ein: Optional[str] = None
70
+
71
class PaymentBase(BaseModel):
    """Billing record payload.

    NOTE(review): `amount` is float here but Integer on the Payment ORM
    model — confirm units and align.
    """
    amount: float
    status: str
    plan_name: str
    date: datetime

class PaymentResponse(PaymentBase):
    """Billing record with identifier and optional invoice path."""
    id: int
    invoice_pdf: Optional[str] = None

    class Config:
        from_attributes = True
app/services/__init__.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Services Layer

This package contains all business logic for the Visique platform.

## Module Index

- `feature_service` - Feature flag resolution and plan management
- `analysis/` - Financial analysis and calculations
- `ingestion/` - Data parsing (CSV, PDF)
- `intelligence/` - AI-powered features (Gemini, RAG)
- `reporting/` - Report generation (PDF, PPTX)

## Usage Pattern

```python
from app.services.feature_service import get_effective_features, check_upload_limit
from app.services.analysis.fundamental import FundamentalAnalyzer
from app.services.intelligence.gemini_service import GeminiService
```

## Design Principles

1. **Stateless**: Services don't hold state between calls
2. **Testable**: All dependencies injected as parameters
3. **Single Purpose**: Each module handles one domain
4. **Error Handling**: Raise specific exceptions, don't swallow errors
"""

# Re-export commonly used functions for convenience.
# Note: importing this package therefore also imports feature_service.
from app.services.feature_service import (
    get_effective_features,
    check_upload_limit,
    increment_upload_count,
    get_effective_upload_limit,
)
app/services/analysis/__init__.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Financial Analysis Services

This package contains the core financial analysis logic.

## Module Responsibilities

| Module | Purpose | Key Functions |
|--------|---------|---------------|
| `fundamental.py` | Main orchestrator | `FundamentalAnalyzer.analyze()` |
| `kpi.py` | KPI calculations | `calculate_margins()`, `calculate_ratios()` |
| `risk.py` | Risk assessment | `calculate_risk_score()`, `identify_risk_factors()` |
| `health_score.py` | Overall health | `compute_health_score()` |
| `growth.py` | Growth metrics | `calculate_growth_rates()` |
| `simulation.py` | What-if modeling | `simulate_scenario()` |

## Data Flow

```
Raw Data (CSV/PDF)

Ingestion Layer (parsed dict)

FundamentalAnalyzer.analyze()
├── KPI Calculator
├── Risk Analyzer
├── Health Score
├── Growth Metrics
└── (optional) AI Enrichment

StandardizedDataPackage
```

## Usage

```python
from app.services.analysis.fundamental import FundamentalAnalyzer

analyzer = FundamentalAnalyzer()
result = await analyzer.analyze(parsed_data, user, filename)
# result is a StandardizedDataPackage (Pydantic model)
```

## Adding New Analysis Modules

1. Create new file in this directory (e.g., `budget.py`)
2. Define calculation functions with type hints
3. Import and call from `FundamentalAnalyzer.analyze()`
4. Add result to `StandardizedDataPackage` schema
5. (Optional) Register as feature in `feature_registry.py`
"""

# Re-export main analyzer for convenience.
# Note: importing this package therefore also imports fundamental (and its
# financial_model path bootstrap).
from app.services.analysis.fundamental import FundamentalAnalyzer
app/services/analysis/engine_lite.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
from pathlib import Path
from typing import List, Optional, Dict, Any
from app.schemas.financial import FinancialReport, BudgetModel

# Make the sibling 'financial_model' package importable: walk up to the
# assumed repository root and prepend it to sys.path.
# NOTE(review): parents[4] hard-codes the directory depth — confirm when the
# tree layout changes.
try:
    current_file = Path(__file__).resolve()
    project_root = current_file.parents[4]
    if (project_root / "financial_model").exists():
        if str(project_root) not in sys.path:
            sys.path.insert(0, str(project_root))
except Exception:
    pass

try:
    from financial_model.core import FinancialAnalyzer
except ImportError:
    # Fallback: last-ditch relative path (depends on the process CWD).
    sys.path.insert(0, "../../../../../")
    from financial_model.core import FinancialAnalyzer
+
23
class LiteAnalyzer:
    """
    Visi-Insight-2 (Lite Engine)
    Optimized for memory-constrained environments.
    - No External API calls (GeoService removed)
    - No Heavy Simulation (if added in future)
    - Pure Mathematical Analysis only
    """
    @staticmethod
    def analyze(report: FinancialReport, budget: Optional[BudgetModel] = None,
                comparisons: Optional[List[FinancialReport]] = None,
                user_address: Optional[str] = None,
                enabled_features: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Run the lite analysis pipeline and return the raw results dict.

        Fix: ``enabled_features`` previously defaulted to a mutable ``[]``
        (a single list shared across every call); it now defaults to None.
        The parameter is currently unused by this engine and is kept only
        for signature parity with the standard analyzer.
        """
        enabled_features = enabled_features if enabled_features is not None else []

        # Run Pure Math Analysis
        analyzer = FinancialAnalyzer(report)
        results = analyzer.run_full_analysis(budget, comparisons, user_address)

        # Tag result as Lite so consumers can tell which engine produced it.
        results['meta'] = {
            "engine": "Visi-Insight-2 (Lite)",
            "optimized": True
        }

        # Explicitly exclude heavy/external modules like GeoService.
        results['geo_analysis'] = None

        return results
app/services/analysis/factory.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.models.user import User
2
+ from app.services.analysis.fundamental import FundamentalAnalyzer
3
+ from app.services.analysis.engine_lite import LiteAnalyzer
4
+
5
class AnalysisFactory:
    """Selects an analysis engine class based on the user's preference."""

    @staticmethod
    def get_analyzer(user: User):
        """
        Return the analyzer class matching the user's ``preferred_engine``.

        'v2' selects the memory-optimized LiteAnalyzer; any other value
        (including a missing attribute) falls back to the standard
        FundamentalAnalyzer (V1).
        """
        preference = getattr(user, 'preferred_engine', 'v1')
        return LiteAnalyzer if preference == 'v2' else FundamentalAnalyzer
app/services/analysis/fundamental.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from pathlib import Path
3
+ from typing import List, Optional, Dict, Any
4
+
5
+ # Ensure project root is in path so we can import 'financial_model' package
6
+ try:
7
+ current_file = Path(__file__).resolve()
8
+ project_root = current_file.parents[4]
9
+ if (project_root / "financial_model").exists():
10
+ if str(project_root) not in sys.path:
11
+ sys.path.insert(0, str(project_root))
12
+ except Exception:
13
+ pass
14
+
15
+ from app.schemas.financial import (
16
+ FinancialReport,
17
+ BudgetModel,
18
+ StandardizedDataPackage
19
+ )
20
+ # Import Core Logic from Library Package
21
+ try:
22
+ from financial_model.core import FinancialAnalyzer
23
+ except ImportError:
24
+ # If path setup failed, try forcing the path
25
+ sys.path.insert(0, "../../../../../")
26
+ from financial_model.core import FinancialAnalyzer
27
+
28
class FundamentalAnalyzer:
    """Standard (V1) engine: core-library math plus external Geo enrichment."""

    @staticmethod
    def analyze(
        report: FinancialReport,
        budget: Optional[BudgetModel] = None,
        comparisons: Optional[List[FinancialReport]] = None,
        user_address: Optional[str] = None,
        enabled_features: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Main entry point for analysis.
        Delegates core logic to the independent 'financial_model' library.
        Enhances result with external services (GeoService).

        :param enabled_features: feature flag IDs; only "geo_insights" is
            consulted here. Defaults to no extra features.
        """
        # Fix: mutable default argument ([]) replaced with a None sentinel
        # so the list can never be shared between calls.
        if enabled_features is None:
            enabled_features = []

        # 1. Run Pure Financial Analysis (Library)
        analyzer = FinancialAnalyzer(report)
        results = analyzer.run_full_analysis(budget, comparisons, user_address)

        # 2. Inject External Services (Geo Intelligence)
        # This keeps the library pure and the backend handling integration
        geo_analysis = None
        analysis_address = None
        is_own_company = False

        # Prefer the address embedded in the report; fall back to the
        # caller-supplied user address, then to a name-based placeholder.
        if hasattr(report, 'company_address') and report.company_address:
            analysis_address = report.company_address
            if user_address and user_address.lower().strip() == report.company_address.lower().strip():
                is_own_company = True
        elif user_address:
            analysis_address = user_address
            is_own_company = True
        else:
            analysis_address = f"{report.company_name} Location"

        if "geo_insights" in enabled_features and analysis_address:
            try:
                # Imported lazily: GeoService is optional and heavyweight.
                from app.services.intelligence.geo_service import GeoService
                geo_analysis = GeoService.analyze_location(
                    analysis_address,
                    report.metrics.industry,
                    is_own_company=is_own_company,
                    company_name=report.company_name
                )
            except ImportError:
                print("Warning: GeoService not available.")
            except Exception as e:
                # Geo enrichment is best-effort; never fail the analysis.
                print(f"Error in GeoService: {e}")

        if geo_analysis:
            results['geo_analysis'] = geo_analysis

        return results
app/services/analysis/growth.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.financial import FinancialReport
2
+
3
class GrowthAnalyzer:
    """Heuristic growth-signal detection over a single reporting period."""

    @staticmethod
    def analyze_growth_potential(report: FinancialReport) -> str:
        """
        Scan the income statement for growth signals and summarize them.

        With only one period available in the standard import, this relies
        on simple heuristics rather than period-over-period trends.
        """
        income = report.income_statement
        findings = []

        # Scale heuristic: a seven-figure top line implies market traction.
        if income.revenue > 1_000_000:
            findings.append(
                "High Volume Business: Revenue > $1M suggests established market presence."
            )

        # Margin heuristic: strong operating margins suggest scalability.
        if income.operating_income and income.revenue:
            margin = income.operating_income / income.revenue
            if margin > 0.20:
                findings.append(
                    "Scalable Model: Operating margins > 20% indicate high growth potential."
                )

        if findings:
            return "Growth Potential: " + " ".join(findings)
        return "Growth Potential: Stable / Needs more historical data."
app/services/analysis/health_score.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.financial import KPIMetrics, HealthScoreBreakdown
2
+
3
class HealthScoreAnalyzer:
    """Composite 0-100 health score built from four weighted pillars."""

    @staticmethod
    def calculate(metrics: KPIMetrics) -> HealthScoreBreakdown:
        """
        Score the company across Stability (max 25), Profitability (max 35),
        Growth (max 20) and Efficiency (max 20), capping the total at 100.

        Missing (falsy) metrics contribute no points, except debt-to-equity,
        where absence (or zero) is treated as acceptable leverage.
        """
        # --- Stability: liquidity + leverage, max 25 ---
        stability = 0
        cr = metrics.current_ratio
        if cr:
            if cr > 1.5:
                stability += 15
            elif cr > 1.0:
                stability += 10
        dte = metrics.debt_to_equity
        if not dte:
            # No (or zero) leverage data: assume acceptable.
            stability += 10
        elif dte < 1.0:
            stability += 10
        elif dte < 2.0:
            stability += 5

        # --- Profitability: margins + returns, max 35 ---
        profitability = 0
        nm = metrics.net_margin
        if nm:
            if nm > 15:
                profitability += 15
            elif nm > 5:
                profitability += 10
            elif nm > 0:
                profitability += 5
        gm = metrics.gross_margin
        if gm:
            if gm > 40:
                profitability += 10
            elif gm > 20:
                profitability += 5
        if metrics.roe and metrics.roe > 15:
            profitability += 10

        # --- Growth: baseline only (single-snapshot data), max 20 ---
        growth = 10

        # --- Efficiency: collection speed, max 20 ---
        efficiency = 10
        if metrics.dso and metrics.dso < 45:
            efficiency += 10

        return HealthScoreBreakdown(
            stability=stability,
            profitability=profitability,
            growth=growth,
            efficiency=efficiency,
            total_score=min(100, stability + profitability + growth + efficiency),
        )
app/services/analysis/kpi.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.financial import IncomeStatementStandard, BalanceSheetStandard, CashFlowStandard, KPIMetrics
2
+ from .registry import KPIRegistry
3
+
4
class KPIAnalyzer:
    """Bridges the dynamic KPIRegistry with the KPIMetrics schema."""

    @staticmethod
    def initialize_default_kpis():
        """Registers the standard Visi-Insight KPIs into the dynamic engine."""

        # Helper to avoid division by zero; returns a percentage.
        safe_div = lambda num, den: (num / den) * 100 if den and den != 0 else 0.0

        # Profitability
        KPIRegistry.register(
            "gross_margin", "Gross Profit Margin (%)", "Profitability",
            lambda r: safe_div(r.income_statement.gross_profit, r.income_statement.revenue or 1)
        )
        KPIRegistry.register(
            "operating_margin", "Operating Income Margin (%)", "Profitability",
            lambda r: safe_div(r.income_statement.operating_income, r.income_statement.revenue or 1)
        )
        KPIRegistry.register(
            "net_margin", "Net Profit Margin (%)", "Profitability",
            lambda r: safe_div(r.income_statement.net_income, r.income_statement.revenue or 1)
        )

        # Liquidity
        KPIRegistry.register(
            "current_ratio", "Current Ratio", "Liquidity",
            lambda r: r.balance_sheet.total_current_assets / (r.balance_sheet.total_current_liabilities or 1)
        )

        # Solvency
        KPIRegistry.register(
            "debt_to_equity", "Debt to Equity Ratio", "Solvency",
            lambda r: r.balance_sheet.total_liabilities / (r.balance_sheet.total_equity or 1) if r.balance_sheet.total_liabilities else 0.0
        )
        KPIRegistry.register(
            "roe", "Return on Equity (%)", "Solvency",
            lambda r: safe_div(r.income_statement.net_income, r.balance_sheet.total_equity or 1)
        )

        # Efficiency
        KPIRegistry.register(
            "dso", "Days Sales Outstanding", "Efficiency",
            lambda r: r.balance_sheet.accounts_receivable / (r.income_statement.revenue / 365) if r.income_statement.revenue and r.income_statement.revenue > 0 and r.balance_sheet.accounts_receivable else 0.0
        )

        # Specific
        KPIRegistry.register(
            "prime_cost", "Prime Cost (%)", "Service/Restaurant",
            lambda r: safe_div(r.income_statement.cogs + r.income_statement.payroll_expenses, r.income_statement.revenue or 1)
        )

    @staticmethod
    def calculate_metrics(report: 'FinancialReport') -> KPIMetrics:
        """Evaluate all registered KPIs for *report* and map them onto KPIMetrics.

        Standard KPI names are assigned to dedicated schema fields for
        backwards compatibility; any additional registered KPIs land in
        ``custom_metrics``.
        """
        # 1. Ensure the default registry is loaded
        if not KPIRegistry.get_all_kpis():
            KPIAnalyzer.initialize_default_kpis()

        # 2. Evaluate all dynamic registry KPIs against the current report object.
        # The dynamic engine returns a dict of standard KPI names -> float values.
        dynamic_results = KPIRegistry.evaluate_all(report)

        # 3. Apply standard backwards-compatible assignments on the Pydantic schema
        metrics = KPIMetrics()
        metrics.gross_margin = dynamic_results.pop("gross_margin", 0.0)
        metrics.operating_margin = dynamic_results.pop("operating_margin", 0.0)
        metrics.net_margin = dynamic_results.pop("net_margin", 0.0)

        metrics.current_ratio = dynamic_results.pop("current_ratio", 0.0)
        metrics.debt_to_equity = dynamic_results.pop("debt_to_equity", 0.0)
        metrics.roe = dynamic_results.pop("roe", 0.0)
        metrics.dso = dynamic_results.pop("dso", 0.0)
        metrics.prime_cost = dynamic_results.pop("prime_cost", 0.0)

        # All remaining dynamically registered KPIs go into the custom_metrics dict
        metrics.custom_metrics = dynamic_results

        # Extracted or Calculated Extra Metrics (Metadata).
        # Fix: both handlers used bare `except:` which swallows every
        # exception (including KeyboardInterrupt); only conversion errors
        # are expected from float().
        if "extracted_restaurant_margin" in report.metadata:
            try:
                metrics.restaurant_margin = float(report.metadata["extracted_restaurant_margin"])
            except (TypeError, ValueError):
                pass

        if "extracted_effective_tax_rate" in report.metadata:
            try:
                metrics.effective_tax_rate = float(report.metadata["extracted_effective_tax_rate"])
            except (TypeError, ValueError):
                pass
        elif report.income_statement.taxes > 0 and report.income_statement.net_income > 0:
            # Derive the rate from pre-tax income when not explicitly extracted.
            pre_tax = report.income_statement.net_income + report.income_statement.taxes
            metrics.effective_tax_rate = (report.income_statement.taxes / pre_tax) * 100

        return metrics
app/services/analysis/registry.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, Any, Optional
2
+
3
class KPIRegistry:
    """
    Dynamic KPI Registry for Visi-Insight Phase 3.

    Decouples KPI math from hardcoded backend logic: formulas are defined,
    looked up, and executed through a single class-level registry, giving a
    plug-and-play architecture where KPIs can be added on the fly.
    """

    _registry: Dict[str, Dict[str, Any]] = {}

    @classmethod
    def register(cls, name: str, description: str, category: str, formula: Callable[[Any], float]):
        """
        Register a new KPI formula.

        :param name: Unique identifier for the KPI (e.g. 'gross_margin')
        :param description: Human-readable description
        :param category: Category logic group (e.g. 'profitability')
        :param formula: Callable taking the `FinancialReport` object and returning a float
        """
        entry = {
            "description": description,
            "category": category,
            "formula": formula,
        }
        cls._registry[name] = entry

    @classmethod
    def get_formula(cls, name: str) -> Optional[Callable]:
        """Return the formula callable for *name*, or None if unregistered."""
        return cls._registry.get(name, {}).get("formula")

    @classmethod
    def get_all_kpis(cls) -> Dict[str, Dict[str, Any]]:
        """Return the full registry mapping (live reference, not a copy)."""
        return cls._registry

    @classmethod
    def evaluate(cls, name: str, report: Any) -> float:
        """
        Evaluate a single registered KPI formula safely against a report.

        Raises KeyError for unregistered names; any exception raised by the
        formula itself (e.g. division by zero) is swallowed and yields 0.0.
        """
        formula = cls.get_formula(name)
        if formula is None:
            raise KeyError(f"KPI '{name}' is not registered in the Dynamic KPI Engine.")
        try:
            return formula(report)
        except Exception:
            # Math errors (division by zero, missing fields) degrade to 0.0.
            return 0.0

    @classmethod
    def evaluate_all(cls, report: Any) -> Dict[str, float]:
        """
        Evaluate all registered KPIs for a given report.

        Returns a flat dictionary mapping each KPI name to its value.
        """
        return {name: cls.evaluate(name, report) for name in cls._registry}

    @classmethod
    def clear(cls):
        """Drop every registered KPI (testing / hot-reload support)."""
        cls._registry = {}
app/services/analysis/risk.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from app.schemas.financial import KPIMetrics, RiskAnalysis
3
+
4
class RiskAnalyzer:
    """Derives a 0-100 risk score plus categorical risk flags from KPI data."""

    @staticmethod
    def analyze(metrics: KPIMetrics, balance_cash: float = 0.0, monthly_burn: float = 0.0) -> RiskAnalysis:
        """
        Score risk by deducting penalty points from a perfect 100.

        :param metrics: calculated KPI metrics for the period
        :param balance_cash: cash on hand, used for runway estimation
        :param monthly_burn: average monthly cash burn; > 0 enables the runway check
        :returns: RiskAnalysis with the floor-capped score, factor messages,
            liquidity/solvency categories and optional runway in months
        """
        score = 100.0
        factors = []
        liquidity = "Low Risk" # Default assumes good
        solvency = "Low Risk"

        # 1. Liquidity Risk (Current Ratio)
        # NOTE(review): a current_ratio of exactly 0 is falsy and lands in
        # the "missing data" branch -- presumably intentional; confirm.
        if metrics.current_ratio:
            if metrics.current_ratio < 1.0:
                score -= 20
                factors.append("Critical: Current Ratio < 1.0 (Liquidity Issue)")
                liquidity = "Critical"
            elif metrics.current_ratio < 1.5:
                score -= 10
                factors.append("Warning: Current Ratio < 1.5")
                liquidity = "Medium"
        else:
            # Missing data is flagged but carries no score penalty.
            factors.append("Unknown: Missing Current Ratio data")

        # 2. Solvency Risk (Debt to Equity)
        if metrics.debt_to_equity:
            if metrics.debt_to_equity > 2.0:
                score -= 15
                factors.append("High Leverage: Debt/Equity > 2.0")
                solvency = "High Risk"
            elif metrics.debt_to_equity > 1.0:
                # Moderate leverage: flagged by category only, no deduction.
                solvency = "Medium Risk"

        # 3. Profitability Risk
        if metrics.net_margin and metrics.net_margin < 0:
            score -= 25
            factors.append("Loss Making: Negative Net Margin")

        # 4. Burn Rate (Runway) -- only computed when a burn rate is supplied.
        runway_months = None
        if monthly_burn > 0:
            runway_months = balance_cash / monthly_burn
            if runway_months < 3:
                score -= 25
                factors.append(f"CRITICAL: Low Cash Runway ({runway_months:.1f} months)")
                # Runway below 3 months overrides any earlier liquidity rating.
                liquidity = "Critical"
            elif runway_months < 6:
                score -= 10
                factors.append(f"Warning: Cash Runway < 6 months ({runway_months:.1f} months)")

        return RiskAnalysis(
            risk_score=max(0.0, score),
            risk_factors=factors,
            liquidity_risk=liquidity,
            solvency_risk=solvency,
            burn_rate_months=runway_months
        )
app/services/analysis/simulation.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.financial import FinancialReport, StandardizedDataPackage, KPIMetrics, RiskAnalysis, IncomeStatementStandard
2
+ from app.services.analysis.kpi import KPIAnalyzer
3
+ from app.services.analysis.risk import RiskAnalyzer
4
+ from app.services.analysis.health_score import HealthScoreAnalyzer
5
+ from app.services.analysis.fundamental import FundamentalAnalyzer
6
+ import copy
7
+
8
class SimulationService:
    """What-if scenario engine: perturbs a report and re-runs the full analysis."""

    @staticmethod
    def run_simulation(
        original_data: FinancialReport,
        delta_revenue_percent: float = 0.0,
        delta_cogs_percent: float = 0.0,
        delta_payroll_percent: float = 0.0,
        delta_marketing_percent: float = 0.0,
        delta_fixed_costs_percent: float = 0.0
    ) -> StandardizedDataPackage:
        """
        Runs a What-If scenario on the financial data.
        Delta percentages are passed as floats (e.g., 10.0 for +10%).

        :param original_data: baseline report (never mutated; deep-copied)
        :returns: a StandardizedDataPackage built from the re-analyzed copy
        """

        # Deep copy to avoid mutating original
        simulated_report = copy.deepcopy(original_data)
        income = simulated_report.income_statement

        # Apply deltas in place on the copied income statement.
        # NOTE(review): assumes these fields are numeric (not None) --
        # confirm the schema guarantees defaults.
        if delta_revenue_percent != 0:
            income.revenue *= (1 + delta_revenue_percent / 100)

        if delta_cogs_percent != 0:
            income.cogs *= (1 + delta_cogs_percent / 100)

        if delta_payroll_percent != 0:
            income.payroll_expenses *= (1 + delta_payroll_percent / 100)

        if delta_marketing_percent != 0:
            income.marketing_expenses *= (1 + delta_marketing_percent / 100)

        if delta_fixed_costs_percent != 0:
            # "Fixed costs" covers both rent and other operating expenses.
            income.rent_expense *= (1 + delta_fixed_costs_percent / 100)
            income.other_operating_expenses *= (1 + delta_fixed_costs_percent / 100)

        # Re-calculate dependent fields
        # Note: In a real complex model, variable costs might scale with revenue automatically.
        # Here we assume structure stays static unless explicitly modified.

        # Re-run Full Analysis (Phase 3 Update)
        # Instead of calling individual analyzers, call the main FundamentalAnalyzer
        # This ensures simulated data gets Runway, Optimization, etc.
        # NOTE(review): relies on FundamentalAnalyzer.analyze returning the
        # keys 'kpis', 'risk_analysis', 'health_score', 'insights',
        # 'recommendations', 'runway_forecast', 'optimization_insights'
        # with 'insights' being a mutable list -- verify against the library.
        full_analysis = FundamentalAnalyzer.analyze(simulated_report)

        # Override insights to show what changed (prepended so it reads first)
        sim_summary = f"Simulation: Rev {delta_revenue_percent:+.0f}%, COGS {delta_cogs_percent:+.0f}%, Mkt {delta_marketing_percent:+.0f}%, Fixed {delta_fixed_costs_percent:+.0f}%"
        full_analysis['insights'].insert(0, sim_summary)

        return StandardizedDataPackage(
            raw_data=simulated_report,
            kpis=full_analysis['kpis'],
            risk_analysis=full_analysis['risk_analysis'],
            health_score=full_analysis['health_score'],
            insights=full_analysis['insights'],
            recommendations=full_analysis['recommendations'],
            runway_forecast=full_analysis['runway_forecast'],
            optimization_insights=full_analysis['optimization_insights']
        )
app/services/feature_service.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feature Service - Business logic for feature flag management.
3
+
4
+ Handles the resolution of feature availability considering:
5
+ 1. Admin overrides (from database)
6
+ 2. Plan defaults (from plan_config.py)
7
+ 3. Feature registry validation
8
+ """
9
+
10
+ from typing import List, Dict, Optional, Any
11
+ from sqlalchemy.orm import Session
12
+ from datetime import datetime, timedelta
13
+
14
+ from app.core.feature_registry import (
15
+ get_all_features,
16
+ get_feature_by_id,
17
+ get_all_feature_ids,
18
+ get_features_by_category,
19
+ Feature
20
+ )
21
+ from app.core.plan_config import (
22
+ get_default_features,
23
+ get_upload_limit as get_default_upload_limit,
24
+ get_all_plans,
25
+ get_all_engines,
26
+ PLAN_DEFAULTS
27
+ )
28
+ from app.models.feature_flags import PlanFeatureOverride, PlanUploadLimit
29
+ from app.models.user import User
30
+
31
+
32
def get_effective_features(db: Session, plan_name: str) -> List[str]:
    """
    Return the enabled feature IDs for *plan_name*.

    Resolution order:
    1. Start with plan defaults from plan_config.py
    2. Apply any admin overrides stored in the database
    """
    enabled = set(get_default_features(plan_name))

    rows = (
        db.query(PlanFeatureOverride)
        .filter(PlanFeatureOverride.plan_name == plan_name)
        .all()
    )

    # Each override either force-enables or force-disables one feature.
    for row in rows:
        if row.enabled:
            enabled.add(row.feature_id)
        else:
            enabled.discard(row.feature_id)

    return list(enabled)
57
+
58
+
59
def is_feature_enabled(db: Session, plan_name: str, feature_id: str) -> bool:
    """Return True when *feature_id* is enabled for *plan_name* (overrides included)."""
    return feature_id in get_effective_features(db, plan_name)
63
+
64
+
65
def resolve_user_features(db: Session, user: User) -> List[str]:
    """
    Resolve final feature flags for a user, combining:
    1. Plan Entitlements (Base)
    2. User-Specific Overrides (Add-ons/Removals) -> stored in user.custom_features
    3. Engine Constraints (Hard Limit)

    Returns: List of enabled feature IDs.
    """
    # 1. Base plan features (admins always resolve against the Admin plan)
    current_plan = user.plan or "Free"
    if user.is_admin:
        current_plan = "Admin"

    plan_features = set(get_effective_features(db, current_plan))

    # 2. Apply user custom overrides (add-ons / removals).
    # user.custom_features is a JSON dict { "feature_id": bool }.
    # SQLAlchemy's JSON column may hand back None (default not applied yet)
    # or, on SQLite, a raw JSON string.
    custom_map = user.custom_features or {}
    if isinstance(custom_map, str):
        import json
        try:
            custom_map = json.loads(custom_map)
        except ValueError:
            # Fix: was a bare `except:` that swallowed every exception.
            # json.JSONDecodeError subclasses ValueError; a malformed value
            # simply means "no custom overrides".
            custom_map = {}

    for fid, enabled in custom_map.items():
        if enabled:
            plan_features.add(fid)
        else:
            plan_features.discard(fid)

    # 3. Apply engine constraints (hardware limits). Engine entitlements
    # are modelled as pseudo-plans keyed "_ENGINE_<name>"; default to v1.
    engine_pref = getattr(user, "preferred_engine", "v1") or "v1"
    engine_key = f"_ENGINE_{engine_pref}"
    engine_features = set(get_effective_features(db, engine_key))

    # Final Result = (Plan U Custom) INTERSECT Engine
    return list(plan_features.intersection(engine_features))
109
+
110
+
111
+
112
def get_effective_upload_limit(db: Session, plan_name: str) -> int:
    """
    Return the monthly upload limit for *plan_name*, honoring an admin
    override when one exists, otherwise the plan_config default.
    """
    override = (
        db.query(PlanUploadLimit)
        .filter(PlanUploadLimit.plan_name == plan_name)
        .first()
    )
    return override.upload_limit if override else get_default_upload_limit(plan_name)
125
+
126
+
127
def get_all_plan_features(db: Session) -> Dict[str, Dict[str, Any]]:
    """
    Build the feature configuration for every plan.

    Returns {plan_name: {"upload_limit": int, "features": {feature_id: bool}}}.
    """
    feature_ids = get_all_feature_ids()
    config = {}

    for plan in get_all_plans():
        enabled = set(get_effective_features(db, plan))
        config[plan] = {
            "upload_limit": get_effective_upload_limit(db, plan),
            "features": {fid: fid in enabled for fid in feature_ids},
        }

    return config
148
+
149
+
150
def get_feature_matrix(db: Session) -> Dict[str, Any]:
    """
    Assemble the admin-console feature matrix: every feature grouped by
    category, with per-plan and per-engine enablement flags.
    """
    categories = get_features_by_category()
    plans = get_all_plans()
    engines = get_all_engines()

    matrix = {}
    for cat_name, features in categories.items():
        rows = []
        for feature in features:
            rows.append({
                "id": feature.id,
                "name": feature.name,
                "description": feature.description,
                # Older Feature definitions may predate the memory field.
                "memory_cost_mb": getattr(feature, "memory_cost_mb", 0),
                "plans": {p: is_feature_enabled(db, p, feature.id) for p in plans},
                "engines": {e: is_feature_enabled(db, e, feature.id) for e in engines},
            })
        matrix[cat_name] = rows

    return {
        "categories": list(categories.keys()),
        "plans": plans,
        "engines": engines,
        "matrix": matrix,
    }
184
+
185
+
186
def set_feature_override(
    db: Session,
    plan_name: str,
    feature_id: str,
    enabled: bool,
    admin_id: Optional[int] = None
) -> PlanFeatureOverride:
    """
    Create or update the feature override row for (plan_name, feature_id).

    Raises ValueError if the feature ID is not in the registry.
    """
    # Validate against the feature registry before touching the database.
    if not get_feature_by_id(feature_id):
        raise ValueError(f"Unknown feature ID: {feature_id}")

    row = db.query(PlanFeatureOverride).filter(
        PlanFeatureOverride.plan_name == plan_name,
        PlanFeatureOverride.feature_id == feature_id
    ).first()

    if row is None:
        row = PlanFeatureOverride(
            plan_name=plan_name,
            feature_id=feature_id,
            enabled=enabled,
            updated_by_id=admin_id,
        )
        db.add(row)
    else:
        row.enabled = enabled
        row.updated_by_id = admin_id

    db.commit()
    db.refresh(row)
    return row
221
+
222
+
223
def bulk_set_features(
    db: Session,
    plan_name: str,
    feature_states: Dict[str, bool],
    admin_id: Optional[int] = None
) -> int:
    """
    Apply several feature overrides to one plan in a single call.

    Returns the number of features updated.
    """
    updated = 0
    for fid, state in feature_states.items():
        set_feature_override(db, plan_name, fid, state, admin_id)
        updated += 1
    return updated
238
+
239
+
240
def reset_plan_to_defaults(db: Session, plan_name: str) -> int:
    """
    Delete every override for *plan_name*, reverting it to plan_config defaults.

    Returns the number of overrides removed.
    """
    deleted = (
        db.query(PlanFeatureOverride)
        .filter(PlanFeatureOverride.plan_name == plan_name)
        .delete()
    )
    db.commit()
    return deleted
250
+
251
+
252
def check_upload_limit(db: Session, user: User) -> Dict[str, Any]:
    """
    Check if user can upload, considering their plan limit.
    Also handles the rolling 30-day reset of the monthly counter.

    Returns:
        {
            "can_upload": bool,
            "uploads_used": int,
            "uploads_limit": int,
            "uploads_remaining": int,
            "reset_date": str   # ISO-8601 timestamp of the next reset
        }
    """
    # Check if we need to reset monthly count. This is a rolling 30-day
    # window (not calendar months); naive UTC timestamps throughout.
    now = datetime.utcnow()
    if user.upload_reset_date:
        days_since_reset = (now - user.upload_reset_date).days
        if days_since_reset >= 30:
            user.monthly_upload_count = 0
            user.upload_reset_date = now
            db.commit()
    else:
        # First limit check ever: start the 30-day window now.
        user.upload_reset_date = now
        db.commit()

    # Get effective limit.
    # NOTE(review): the fallback plan here is "Individual" while
    # resolve_user_features falls back to "Free" -- confirm which is intended.
    plan = user.plan or "Individual"
    if user.is_admin:
        plan = "Admin"

    limit = get_effective_upload_limit(db, plan)
    used = user.monthly_upload_count or 0
    remaining = max(0, limit - used)

    # Calculate next reset
    next_reset = user.upload_reset_date + timedelta(days=30) if user.upload_reset_date else now + timedelta(days=30)

    return {
        "can_upload": used < limit,
        "uploads_used": used,
        "uploads_limit": limit,
        "uploads_remaining": remaining,
        "reset_date": next_reset.isoformat()
    }
297
+
298
+
299
def increment_upload_count(db: Session, user: User) -> int:
    """
    Bump the user's monthly upload counter (call after a successful upload).

    Returns the new count.
    """
    current = user.monthly_upload_count or 0
    user.monthly_upload_count = current + 1
    db.commit()
    return user.monthly_upload_count
app/services/ingestion/__init__.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Ingestion Layer - File parsing and data extraction.
3
+
4
+ This package handles parsing of various financial document formats
5
+ and standardizing them into a common FinancialReport schema.
6
+
7
+ ## Supported Formats
8
+
9
+ | Format | Parser | Description |
10
+ |--------|--------|-------------|
11
+ | CSV | CSVParser | Comma-separated financial data |
12
+ | PDF | HybridPDFParser | Dolphin-v2 + pdfplumber hybrid extraction |
13
+ | PDF | PDFParser | Legacy pdfplumber-only parser |
14
+ | XLSX/XLS | XLSXParser | Excel workbooks |
15
+
16
+ ## PDF Hybrid Architecture
17
+
18
+ PDF files are processed by both Dolphin-v2 and pdfplumber:
19
+ 1. Dolphin: layout analysis, document classification, element extraction
20
+ 2. pdfplumber: gap-filling table + regex extraction
21
+ 3. Merge: Dolphin fields take priority, pdfplumber fills gaps
22
+
23
+ If Dolphin is not installed, falls back to pdfplumber-only automatically.
24
+
25
+ ## Usage
26
+
27
+ Use UnifiedParser for automatic format detection:
28
+
29
+ ```python
30
+ from app.services.ingestion import UnifiedParser
31
+
32
+ report = UnifiedParser.parse(file_path, original_filename)
33
+ ```
34
+
35
+ Or use specific parsers directly:
36
+
37
+ ```python
38
+ from app.services.ingestion import CSVParser, HybridPDFParser, XLSXParser
39
+
40
+ report = CSVParser.parse(file_path)
41
+ report = HybridPDFParser.parse(file_path) # Dolphin + pdfplumber
42
+ report = XLSXParser.parse(file_path)
43
+ ```
44
+
45
+ ## Adding New Formats
46
+
47
+ 1. Create `parser_xxx.py` with a class implementing `parse(file_path) -> FinancialReport`
48
+ 2. Register in `unified_parser.py` SUPPORTED_EXTENSIONS dict
49
+ 3. Add import in this `__init__.py`
50
+ """
51
+
52
+ from app.services.ingestion.unified_parser import UnifiedParser
53
+ from app.services.ingestion.parser_csv import CSVParser
54
+ from app.services.ingestion.parser_pdf import PDFParser
55
+ from app.services.ingestion.parser_dolphin import HybridPDFParser
56
+ from app.services.ingestion.parser_xlsx import XLSXParser
57
+ from app.services.ingestion.mappings import DataMapper
app/services/ingestion/doc_keywords.py ADDED
@@ -0,0 +1,1408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document Keyword Registry — 53 Financial Document Types
3
+ ========================================================
4
+ Central registry mapping document types to their identifying keywords and
5
+ extractable field variables. Used by the classifier to identify uploaded
6
+ documents and guide targeted extraction.
7
+
8
+ Generated from the doc-keywords-mapped reference data.
9
+ Learned keywords from admin training are loaded from learned_keywords.json
10
+ and merged at startup.
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import logging
16
+ from dataclasses import dataclass, field
17
+ from typing import Dict, List, Optional
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
@dataclass
class DocTypeDefinition:
    """Definition of a single financial document type.

    Instances of this dataclass populate ``DOC_TYPE_REGISTRY`` and drive
    keyword-based classification of uploaded documents as well as targeted
    extraction of the declared field variables.
    """
    id: int                        # Stable numeric identifier for the document type
    display_name: str              # Human-readable name shown in the UI
    keywords: List[str]            # Keywords used for classification (case-insensitive)
    fields: List[str]              # Extractable FIELD: variables
    category: str = "general"      # Category grouping for UI display
    min_keyword_matches: int = 2   # Minimum keyword hits before scoring
31
+
32
+
33
+ # ============================================================================
34
+ # 53 DOCUMENT TYPE DEFINITIONS
35
+ # ============================================================================
36
+
37
+ DOC_TYPE_REGISTRY: Dict[str, DocTypeDefinition] = {
38
+
39
+ # ── 1. SOC 2 Audit Report ───────────────────────────────────────────
40
+ "soc_2_audit": DocTypeDefinition(
41
+ id=1,
42
+ display_name="SOC 2 Audit Report",
43
+ category="compliance",
44
+ keywords=[
45
+ "INDEPENDENT SERVICE AUDITOR'S REPORT", "SOC 2 TYPE",
46
+ "SYSTEM DESCRIPTION", "TRUST SERVICES CRITERIA",
47
+ "RELEVANT TO SECURITY", "AICPA",
48
+ "ASSERTION OF MANAGEMENT", "SECTION III", "SECTION IV",
49
+ "SOC 2", "SYSTEM AND ORGANIZATION CONTROLS 2",
50
+ "TRUST SERVICES PRINCIPLES", "SECURITY", "AVAILABILITY",
51
+ "PROCESSING INTEGRITY", "CONFIDENTIALITY", "PRIVACY",
52
+ "CONTROL ACTIVITIES", "CONTROL ENVIRONMENT",
53
+ "MONITORING CONTROLS", "opinion", "auditor's conclusion",
54
+ "basis for opinion", "independent auditor's report",
55
+ "qualified/unqualified opinion",
56
+ "opinion on the fairness of the presentation",
57
+ "period covered", "review period", "examination period",
58
+ "service organization", "company name", "organization",
59
+ "exceptions noted", "deviations", "testing exceptions",
60
+ "no exceptions noted", "control exceptions", "deficiencies",
61
+ "weaknesses",
62
+ ],
63
+ fields=[
64
+ "auditor_opinion", "period_covered", "service_organization",
65
+ "control_exceptions", "criteria",
66
+ ],
67
+ ),
68
+
69
+ # ── 2. ARR/MRR Waterfall ────────────────────────────────────────────
70
+ "arr_mrr_waterfall": DocTypeDefinition(
71
+ id=2,
72
+ display_name="ARR/MRR Waterfall",
73
+ category="saas_metrics",
74
+ keywords=[
75
+ "ARR WATERFALL", "MRR ROLLFORWARD",
76
+ "RECURRING REVENUE BRIDGE", "SAAS METRICS",
77
+ "ANNUALIZED RECURRING REVENUE", "CARR",
78
+ "RECURRING REVENUE MOVEMENT", "NET NEW ARR",
79
+ "RECURRING REVENUE WATERFALL", "RECURRING REVENUE ROLLFORWARD",
80
+ "ARR BRIDGE", "MRR BRIDGE",
81
+ "beginning arr", "opening arr", "new logos", "expansion",
82
+ "contraction", "churn", "cancellation", "ending arr",
83
+ "closing arr", "exit arr", "end of period arr", "eop arr",
84
+ "new sales", "upsell", "cross-sell", "price increase",
85
+ "downgrade", "shrinkage", "downsell", "reduction",
86
+ "gross churn", "logo churn", "lost arr",
87
+ "new logo arr", "gross new arr", "new business",
88
+ "new customers", "reactivation",
89
+ ],
90
+ fields=[
91
+ "beginning_arr", "new_logo_arr", "expansion_arr",
92
+ "contraction_arr", "churn_arr", "ending_arr",
93
+ ],
94
+ ),
95
+
96
+ # ── 3. Deferred Revenue Schedule (ASC 606) ──────────────────────────
97
+ "deferred_revenue_schedule": DocTypeDefinition(
98
+ id=3,
99
+ display_name="Deferred Revenue Schedule (ASC 606)",
100
+ category="revenue_recognition",
101
+ keywords=[
102
+ "DEFERRED REVENUE ROLLFORWARD", "CONTRACT LIABILITY SCHEDULE",
103
+ "UNEARNED REVENUE ANALYSIS", "REVENUE RECOGNITION SCHEDULE",
104
+ "ASC 606 DISCLOSURE", "ASC 606", "IFRS 15",
105
+ "SHORT-TERM DEFERRED REVENUE", "LONG-TERM DEFERRED REVENUE",
106
+ "REMAINING PERFORMANCE OBLIGATION",
107
+ "beginning balance", "beginning deferred revenue",
108
+ "opening contract liability", "balance at beginning",
109
+ "billings", "invoiced", "new contracts", "fees billed",
110
+ "revenue recognized", "earned revenue",
111
+ "ending balance", "ending deferred revenue",
112
+ "closing contract liability", "balance at end",
113
+ "satisfaction of performance obligation",
114
+ "transfer to revenue", "amortization",
115
+ "contract with customer liability",
116
+ ],
117
+ fields=[
118
+ "beginning_balance", "billings", "revenue_recognized",
119
+ "ending_balance",
120
+ ],
121
+ ),
122
+
123
+ # ── 4. CAC vs. LTV Model ───────────────────────────────────────────
124
+ "cac_ltv_model": DocTypeDefinition(
125
+ id=4,
126
+ display_name="CAC vs. LTV Model",
127
+ category="saas_metrics",
128
+ keywords=[
129
+ "UNIT ECONOMICS", "LTV/CAC ANALYSIS",
130
+ "CUSTOMER ACQUISITION COST", "COHORT ANALYSIS",
131
+ "LIFETIME VALUE MODEL", "SAAS UNIT ECONOMICS",
132
+ "LTV/CAC RATIO", "MAGIC NUMBER",
133
+ "cac", "customer acquisition cost", "blended cac", "paid cac",
134
+ "cost per acquisition", "ltv", "lifetime value", "cltv",
135
+ "customer lifetime value", "payback period",
136
+ "months to recover", "months to recover cac", "cac payback",
137
+ "arpu", "average revenue per user",
138
+ "average revenue per account", "arpa",
139
+ ],
140
+ fields=[
141
+ "cac", "ltv", "ltv_cac_ratio", "payback_period", "arpu",
142
+ ],
143
+ ),
144
+
145
+ # ── 5. Booking / Backlog Report ─────────────────────────────────────
146
+ "booking_backlog": DocTypeDefinition(
147
+ id=5,
148
+ display_name="Booking / Backlog Report",
149
+ category="sales",
150
+ keywords=[
151
+ "SALES BACKLOG", "REMAINING PERFORMANCE OBLIGATIONS",
152
+ "OPEN ORDERS REPORT", "ORDER BOOK",
153
+ "BOOKINGS BY CUSTOMER", "UNFULFILLED ORDERS",
154
+ "ORDER BACKLOG", "CONTRACT BACKLOG",
155
+ "booking amount", "order value", "total contracted value",
156
+ "remaining obligation", "open balance", "unfilled orders",
157
+ "remaining performance obligation", "remaining contract value",
158
+ "backlog", "open amount", "po amount",
159
+ "booking date", "order date", "signed date",
160
+ "contract effective date", "po date",
161
+ ],
162
+ fields=[
163
+ "customer_name", "contract_value", "backlog_amount",
164
+ "booking_date",
165
+ ],
166
+ ),
167
+
168
+ # ── 6. PCI DSS Compliance ───────────────────────────────────────────
169
+ "pci_dss": DocTypeDefinition(
170
+ id=6,
171
+ display_name="PCI DSS Compliance",
172
+ category="compliance",
173
+ keywords=[
174
+ "ATTESTATION OF COMPLIANCE", "PCI DSS",
175
+ "REPORT ON COMPLIANCE", "DATA SECURITY STANDARD",
176
+ "PAYMENT CARD INDUSTRY", "AOC",
177
+ "PAYMENT CARD INDUSTRY DATA SECURITY STANDARD",
178
+ "SELF-ASSESSMENT QUESTIONNAIRE", "SAQ", "QSA",
179
+ "QUALIFIED SECURITY ASSESSOR", "ASV SCAN",
180
+ "compliant", "non-compliant", "validation status",
181
+ "overall compliance status", "in compliance",
182
+ "merchant level", "service provider level", "merchant tier",
183
+ ],
184
+ fields=[
185
+ "merchant_level", "compliance_status", "assessment_date",
186
+ "qsa_name",
187
+ ],
188
+ ),
189
+
190
+ # ── 7. Sales Tax Nexus ──────────────────────────────────────────────
191
+ "sales_tax_nexus": DocTypeDefinition(
192
+ id=7,
193
+ display_name="Sales Tax Nexus",
194
+ category="tax",
195
+ keywords=[
196
+ "NEXUS STUDY", "ECONOMIC NEXUS", "WAYFAIR ANALYSIS",
197
+ "PHYSICAL PRESENCE TEST", "WAYFAIR",
198
+ "STATE TAX EXPOSURE", "NEXUS DETERMINATION",
199
+ "nexus established", "no nexus", "nexus",
200
+ "physical nexus", "economic nexus",
201
+ "registration required", "sales tax registration",
202
+ "revenue threshold met", "sales threshold",
203
+ "transaction count met", "threshold exceeded",
204
+ "estimated exposure", "potential liability", "tax due",
205
+ "tax at risk", "estimated tax due",
206
+ ],
207
+ fields=[
208
+ "jurisdiction", "threshold_met", "exposure_amount",
209
+ ],
210
+ ),
211
+
212
+ # ── 8. Inventory Aging Report ───────────────────────────────────────
213
+ "inventory_aging": DocTypeDefinition(
214
+ id=8,
215
+ display_name="Inventory Aging Report",
216
+ category="inventory",
217
+ keywords=[
218
+ "INVENTORY AGING", "STOCK AGE ANALYSIS",
219
+ "OBSOLETE INVENTORY", "DEAD STOCK REPORT",
220
+ "SLOW MOVING REPORT", "INVENTORY VALUATION BY AGE",
221
+ "item code", "sku", "part number", "material number",
222
+ "product id", "item id",
223
+ "0-30", "31-60", "61-90", "90+", "120+",
224
+ "0 to 30", "30-60 days", "60-90 days", "over 90",
225
+ "current", "obsolete", "dead stock", "slow moving",
226
+ "days on hand", "total on hand", "extended cost",
227
+ "valuation", "total cost", "total inventory value",
228
+ ],
229
+ fields=[
230
+ "item_id", "0_30_days", "31_60_days", "61_90_days",
231
+ "over_90_days", "total_value",
232
+ ],
233
+ ),
234
+
235
+ # ── 9. GMROI Schedule ───────────────────────────────────────────────
236
+ "gmroi_schedule": DocTypeDefinition(
237
+ id=9,
238
+ display_name="GMROI Schedule",
239
+ category="retail",
240
+ keywords=[
241
+ "GMROI", "GROSS MARGIN RETURN ON INVESTMENT",
242
+ "INVENTORY PERFORMANCE", "MERCHANDISE PERFORMANCE",
243
+ "gross margin", "gross profit",
244
+ "average inventory", "avg inventory cost",
245
+ "average inventory at cost", "average stock",
246
+ "gmroi", "return on inventory", "index",
247
+ "gmroi %", "turnover", "turns", "stock turn",
248
+ "inventory turnover",
249
+ ],
250
+ fields=[
251
+ "gross_margin", "avg_inventory", "gmroi_ratio", "turnover",
252
+ ],
253
+ ),
254
+
255
+ # ── 10. Open-to-Buy (OTB) Plan ─────────────────────────────────────
256
+ "open_to_buy": DocTypeDefinition(
257
+ id=10,
258
+ display_name="Open-to-Buy (OTB) Plan",
259
+ category="retail",
260
+ keywords=[
261
+ "OPEN TO BUY", "OTB PLAN", "MERCHANDISE BUDGET",
262
+ "PURCHASING BUDGET", "RETAIL BUDGET",
263
+ "STOCK TO SALES", "INVENTORY PLAN",
264
+ "projected sales", "planned sales", "sales forecast",
265
+ "forecasted sales", "planned ending inventory",
266
+ "open to buy at retail", "otb purchases",
267
+ "receipt plan", "open to buy", "purchase budget",
268
+ "markdowns", "reductions", "discounts",
269
+ "promotional deductions", "price adjustments",
270
+ "eop inventory",
271
+ ],
272
+ fields=[
273
+ "projected_sales", "markdowns", "eop_inventory",
274
+ "receipts", "gross_sales",
275
+ ],
276
+ ),
277
+
278
+ # ── 11. Sales & Use Tax Filings ─────────────────────────────────────
279
+ "sales_use_tax": DocTypeDefinition(
280
+ id=11,
281
+ display_name="Sales & Use Tax Filings",
282
+ category="tax",
283
+ keywords=[
284
+ "SALES AND USE TAX RETURN", "FORM ST-",
285
+ "MULTISTATE TAX RETURN", "SALES TAX PAYABLE",
286
+ "TAX REMITTANCE", "GROSS RECEIPTS",
287
+ "ST-3", "ST-9",
288
+ "taxable sales", "taxable amount", "taxable receipts",
289
+ "net taxable", "tax collected", "tax due", "total tax",
290
+ "amount remitted", "tax payable",
291
+ "gross sales", "total sales", "gross receipts", "total receipts",
292
+ "filing period",
293
+ ],
294
+ fields=[
295
+ "jurisdiction", "gross_sales", "taxable_sales",
296
+ "tax_collected", "tax_due",
297
+ ],
298
+ ),
299
+
300
+ # ── 12. UNICAP (Sec 263A) ───────────────────────────────────────────
301
+ "unicap": DocTypeDefinition(
302
+ id=12,
303
+ display_name="UNICAP (Sec 263A)",
304
+ category="tax",
305
+ keywords=[
306
+ "SECTION 263A", "UNICAP CALCULATION",
307
+ "SIMPLIFIED PRODUCTION METHOD", "SIMPLIFIED RESALE METHOD",
308
+ "INVENTORY CAPITALIZATION", "ABSORPTION RATIO",
309
+ "HISTORIC ABSORPTION RATIO",
310
+ "costs incurred", "additional costs",
311
+ "capitalizable costs", "capitalized inventory adjustment",
312
+ "section 263a costs", "unicap adjustment",
313
+ "absorption ratio", "historic absorption ratio",
314
+ "allocation ratio", "capitalization rate",
315
+ "ending inventory", "inventory balance", "total inventory",
316
+ ],
317
+ fields=[
318
+ "costs_incurred", "absorption_ratio", "unicap_adjustment",
319
+ "ending_inventory",
320
+ ],
321
+ ),
322
+
323
+ # ── 13. Customer Concentration Report ──────────────────────────────
324
+ "customer_concentration": DocTypeDefinition(
325
+ id=13,
326
+ display_name="Customer Concentration Report",
327
+ category="risk",
328
+ keywords=[
329
+ "CUSTOMER CONCENTRATION", "REVENUE BY CUSTOMER",
330
+ "CLIENT EXPOSURE", "TOP 10 CUSTOMERS",
331
+ "CONSUMER CONCENTRATION REPORT", "SUPPLIER CONCENTRATION REPORT",
332
+ "PARETO ANALYSIS", "WALLET SHARE",
333
+ "concentration %", "cumulative %", "running total %",
334
+ "cumulative share", "share", "percentage of revenue",
335
+ "% of total", "revenue", "total billed", "ytd revenue",
336
+ "annual sales", "customer name", "client",
337
+ ],
338
+ fields=[
339
+ "customer_name", "revenue_amount", "percent_total",
340
+ "cumulative_percent",
341
+ ],
342
+ ),
343
+
344
+ # ── 14. 13-Week Cash Flow Forecast ──────────────────────────────────
345
+ "thirteen_week_cash_flow": DocTypeDefinition(
346
+ id=14,
347
+ display_name="13-Week Cash Flow Forecast",
348
+ category="treasury",
349
+ keywords=[
350
+ "13-WEEK CASH FLOW", "TWELVE WEEK CASH",
351
+ "SHORT TERM LIQUIDITY", "WEEKLY CASH FORECAST",
352
+ "ROLLING CASH FORECAST", "DIRECT METHOD CASH FLOW",
353
+ "ROLLING 13 WEEK CASH FLOW", "SHORT TERM LIQUIDITY FORECAST",
354
+ "13-WEEK ROLLING CASH FLOW",
355
+ "week ending", "week of", "w/e", "week ended",
356
+ "cash receipts", "customer payments", "ar receipts",
357
+ "cash inflows from customers",
358
+ "collections", "cash outflows", "vendor payments",
359
+ "ap payments", "operating disbursements",
360
+ "net cash flow", "net change in cash", "burn",
361
+ "net increase/(decrease)", "ending cash balance",
362
+ "closing cash", "cash position", "period-end cash",
363
+ ],
364
+ fields=[
365
+ "week_ending", "collections", "disbursements",
366
+ "net_cash_flow", "ending_cash",
367
+ ],
368
+ ),
369
+
370
+ # ── 15. Rent Roll ───────────────────────────────────────────────────
371
+ "rent_roll": DocTypeDefinition(
372
+ id=15,
373
+ display_name="Rent Roll",
374
+ category="real_estate",
375
+ keywords=[
376
+ "RENT ROLL", "TENANT ROSTER", "LEASE SCHEDULE",
377
+ "TENANCY SCHEDULE", "PROPERTY RENT ROLL",
378
+ "tenant", "resident", "lessee", "occupant", "tenant name",
379
+ "unit", "suite", "space no", "unit number", "apartment number",
380
+ "lease start", "lease end", "commencement", "expiration",
381
+ "termination date",
382
+ "base rent", "monthly rent", "contract rent", "scheduled rent",
383
+ "market rent", "gross potential rent",
384
+ "total rental revenue", "base rental revenue",
385
+ "vacancy", "vacancy loss", "credit loss", "concessions",
386
+ "effective gross income",
387
+ ],
388
+ fields=[
389
+ "unit_id", "tenant_name", "lease_dates", "base_rent",
390
+ "sq_ft", "security_deposit",
391
+ ],
392
+ ),
393
+
394
+ # ── 16. NOI Statement ───────────────────────────────────────────────
395
+ "noi_statement": DocTypeDefinition(
396
+ id=16,
397
+ display_name="NOI Statement",
398
+ category="real_estate",
399
+ keywords=[
400
+ "NET OPERATING INCOME", "NOI STATEMENT",
401
+ "PROPERTY OPERATING STATEMENT", "PROPERTY P&L",
402
+ "INCOME AND EXPENSE STATEMENT",
403
+ "REAL ESTATE OPERATING STATEMENT",
404
+ "rental revenue", "rental income", "gross potential rent",
405
+ "vacancy loss", "effective gross income",
406
+ "total operating expenses", "property expenses",
407
+ "operating costs", "operating profit",
408
+ "net operating income", "noi",
409
+ "income before debt service",
410
+ "management fees", "property management", "management expense",
411
+ ],
412
+ fields=[
413
+ "rental_revenue", "vacancy_loss", "operating_expenses",
414
+ "noi", "management_fees",
415
+ ],
416
+ ),
417
+
418
+ # ── 17. Occupancy Tax Schedule ──────────────────────────────────────
419
+ "occupancy_tax": DocTypeDefinition(
420
+ id=17,
421
+ display_name="Occupancy Tax Schedule",
422
+ category="hospitality",
423
+ keywords=[
424
+ "OCCUPANCY TAX RETURN", "TOT RETURN",
425
+ "TRANSIENT OCCUPANCY TAX", "LODGING TAX", "HOTEL TAX",
426
+ "ROOM TAX", "SHORT-TERM RENTAL TAX",
427
+ "room revenue", "lodging revenue",
428
+ "taxable receipts", "gross rents",
429
+ "tax due", "total tax", "remittance amount",
430
+ "tax payable", "non-taxable", "tax-exempt revenue",
431
+ "permanent residents", "rooms sold",
432
+ "occupancy percentage", "occupancy %",
433
+ ],
434
+ fields=[
435
+ "room_revenue", "tax_due", "occupancy_rate", "exemptions",
436
+ ],
437
+ ),
438
+
439
+ # ── 18. Fair Housing Compliance Audit ───────────────────────────────
440
+ "fair_housing": DocTypeDefinition(
441
+ id=18,
442
+ display_name="Fair Housing Compliance Audit",
443
+ category="compliance",
444
+ keywords=[
445
+ "FAIR HOUSING COMPLIANCE", "ADA AUDIT",
446
+ "ACCESSIBILITY CHECKLIST", "HUD REVIEW",
447
+ "EQUAL HOUSING OPPORTUNITY", "UFAS COMPLIANCE", "UFAS",
448
+ "AMERICANS WITH DISABILITIES ACT", "SECTION 504",
449
+ "fair housing", "violation", "deficiency", "non-compliance",
450
+ "finding", "corrective action", "remediation",
451
+ "action plan", "corrective measures",
452
+ ],
453
+ fields=[
454
+ "violation_status", "remediation_plan", "audit_date",
455
+ "property_address",
456
+ ],
457
+ ),
458
+
459
+ # ── 19. ASC 842 Lease Liability Schedule ────────────────────────────
460
+ "asc_842_lease": DocTypeDefinition(
461
+ id=19,
462
+ display_name="ASC 842 Lease Liability Schedule",
463
+ category="accounting",
464
+ keywords=[
465
+ "LEASE LIABILITY AMORTIZATION", "ROU ASSET ROLLFORWARD",
466
+ "ASC 842", "IFRS 16", "LEASE OBLIGATION",
467
+ "RIGHT OF USE ASSET", "OPERATING LEASE LIABILITY",
468
+ "FINANCE LEASE SCHEDULE",
469
+ "rou asset", "right of use asset", "operating lease asset",
470
+ "finance lease asset", "lease liability", "lease obligation",
471
+ "present value of payments", "present value of lease payments",
472
+ "future minimum lease payments", "single lease cost",
473
+ "lease expense", "amortization of rou",
474
+ "interest on lease liability",
475
+ "incremental borrowing rate", "ibr",
476
+ "rate implicit in the lease",
477
+ "discount rate",
478
+ ],
479
+ fields=[
480
+ "rou_asset", "lease_liability", "discount_rate",
481
+ "lease_payment", "lease_expense",
482
+ ],
483
+ ),
484
+
485
+ # ── 20. Fixed Asset Roll-forward (FAR) ──────────────────────────────
486
+ "fixed_asset_rollforward": DocTypeDefinition(
487
+ id=20,
488
+ display_name="Fixed Asset Roll-forward (FAR)",
489
+ category="accounting",
490
+ keywords=[
491
+ "FIXED ASSET ROLLFORWARD", "PPE SCHEDULE",
492
+ "DEPRECIATION SCHEDULE", "CHANGES IN CAPITAL ASSETS",
493
+ "CAPITAL ASSET MOVEMENT", "PLANT AND EQUIPMENT SCHEDULE",
494
+ "beginning net book value", "beginning nbv",
495
+ "opening balance", "beginning cost",
496
+ "additions", "purchases", "acquisitions", "asset acquisitions",
497
+ "disposals", "retirements", "sales", "write-offs",
498
+ "depreciation expense", "current depreciation",
499
+ "provision for depreciation",
500
+ "net book value", "closing balance", "ending net book value",
501
+ ],
502
+ fields=[
503
+ "asset_class", "beg_book_value", "additions", "disposals",
504
+ "depreciation_exp",
505
+ ],
506
+ ),
507
+
508
+ # ── 21. CapEx Reserve Schedule ──────────────────────────────────────
509
+ "capex_reserve": DocTypeDefinition(
510
+ id=21,
511
+ display_name="CapEx Reserve Schedule",
512
+ category="real_estate",
513
+ keywords=[
514
+ "CAPEX RESERVE", "REPLACEMENT RESERVE",
515
+ "FF&E RESERVE", "CAPITAL IMPROVEMENT BUDGET",
516
+ "SINKING FUND", "RESERVE FOR REPLACEMENT",
517
+ "RESERVE FOR REPLACEMENTS", "CAPITAL RESERVE STUDY",
518
+ "beginning reserve balance", "reserve balance", "fund balance",
519
+ "escrow balance", "monthly deposit", "contribution",
520
+ "scheduled deposit", "transfer to reserve",
521
+ "draws", "disbursements from reserve",
522
+ "reimbursements", "reserve releases", "withdrawals",
523
+ ],
524
+ fields=[
525
+ "reserve_balance", "monthly_deposit", "draws",
526
+ "approved_projects",
527
+ ],
528
+ ),
529
+
530
+ # ── 22. Cost of Goods Manufactured (COGM) ──────────────────────────
531
+ "cogm": DocTypeDefinition(
532
+ id=22,
533
+ display_name="Cost of Goods Manufactured (COGM)",
534
+ category="manufacturing",
535
+ keywords=[
536
+ "COST OF GOODS MANUFACTURED", "MANUFACTURING STATEMENT",
537
+ "PRODUCTION COST SCHEDULE", "MANUFACTURING EXPENSES",
538
+ "SCHEDULE OF COGM", "STATEMENT OF COST OF GOODS MANUFACTURED",
539
+ "COGM SCHEDULE",
540
+ "direct materials", "raw materials consumed",
541
+ "direct material consumed", "material costs",
542
+ "direct labor", "manufacturing labor", "touch labor",
543
+ "direct manufacturing labor",
544
+ "factory overhead", "manufacturing overhead",
545
+ "indirect costs", "burden", "manufacturing burden",
546
+ "beginning work in process", "beginning wip",
547
+ "ending work in process", "ending wip",
548
+ "cost of goods manufactured", "cogm", "manufacturing costs",
549
+ ],
550
+ fields=[
551
+ "direct_materials", "direct_labor", "factory_overhead",
552
+ "opening_wip", "closing_wip", "cost_goods_mfd",
553
+ ],
554
+ ),
555
+
556
+ # ── 23. Production Variance Report ──────────────────────────────────
557
+ "production_variance": DocTypeDefinition(
558
+ id=23,
559
+ display_name="Production Variance Report",
560
+ category="manufacturing",
561
+ keywords=[
562
+ "PRODUCTION VARIANCE", "MATERIAL USAGE VARIANCE",
563
+ "STANDARD COST VARIANCE", "COST VARIANCE ANALYSIS",
564
+ "PRODUCTION VARIANCE REPORT",
565
+ "standard cost", "planned cost", "expected cost",
566
+ "actual cost", "incurred cost",
567
+ "variance", "difference", "favorable/unfavorable",
568
+ "price variance", "usage variance", "volume variance",
569
+ "efficiency variance", "rate variance",
570
+ "yield variance", "labor efficiency variance",
571
+ "overhead variance",
572
+ ],
573
+ fields=[
574
+ "item_id", "standard_cost", "actual_cost",
575
+ "variance_amount", "variance_type",
576
+ ],
577
+ ),
578
+
579
+ # ── 24. WIP Inventory Valuation ─────────────────────────────────────
580
+ "wip_valuation": DocTypeDefinition(
581
+ id=24,
582
+ display_name="WIP Inventory Valuation",
583
+ category="manufacturing",
584
+ keywords=[
585
+ "WIP VALUATION", "WORK IN PROCESS INVENTORY",
586
+ "JOB COST REPORT", "PROJECT COST SUMMARY",
587
+ "UNBILLED COSTS", "WIP AGING",
588
+ "CONSTRUCTION-IN-PROGRESS",
589
+ "job number", "project id", "work order", "job name",
590
+ "costs to date", "total costs", "cumulative costs",
591
+ "billed to date", "amounts billed", "progress billings",
592
+ "wip balance", "net wip", "costs in excess of billings",
593
+ "unbilled work", "unbilled costs",
594
+ "% complete", "poc", "completion %", "percentage of completion",
595
+ ],
596
+ fields=[
597
+ "job_id", "costs_to_date", "billing_to_date",
598
+ "wip_balance", "percent_complete",
599
+ ],
600
+ ),
601
+
602
+ # ── 25. Open Purchase Order (PO) Log ────────────────────────────────
603
+ "open_po_log": DocTypeDefinition(
604
+ id=25,
605
+ display_name="Open Purchase Order (PO) Log",
606
+ category="procurement",
607
+ keywords=[
608
+ "OPEN PURCHASE ORDERS", "PO STATUS REPORT",
609
+ "OUTSTANDING ORDERS", "VENDOR COMMITMENTS",
610
+ "UNRECEIVED PO", "PURCHASE ORDER LOG",
611
+ "OPEN PO BY VENDOR",
612
+ "po number", "purchase order", "po #", "order no",
613
+ "order date", "purchase order date", "po date",
614
+ "vendor", "supplier", "manufacturer", "vendor name",
615
+ "expected delivery", "delivery date", "eta", "due date",
616
+ "promised date", "open amount", "remaining balance",
617
+ "outstanding balance", "unreceived amount",
618
+ ],
619
+ fields=[
620
+ "po_number", "vendor", "order_date",
621
+ "expected_delivery", "open_amount",
622
+ ],
623
+ ),
624
+
625
+ # ── 26. OSHA Incident Logs ──────────────────────────────────────────
626
+ "osha_incidents": DocTypeDefinition(
627
+ id=26,
628
+ display_name="OSHA Incident Logs",
629
+ category="safety",
630
+ keywords=[
631
+ "OSHA 300", "LOG OF WORK-RELATED INJURIES",
632
+ "OSHA FORM 300A", "OSHA 301",
633
+ "INJURY AND ILLNESS INCIDENT REPORT",
634
+ "SUMMARY OF WORK-RELATED INJURIES AND ILLNESSES",
635
+ "case number", "osha case id", "case no",
636
+ "employee's name", "injured worker", "name",
637
+ "date of injury", "incident date", "date of onset",
638
+ "description of injury", "injury type",
639
+ "nature of injury", "classification",
640
+ "days away from work", "job transfer",
641
+ "other recordable", "death",
642
+ ],
643
+ fields=[
644
+ "case_number", "employee_name", "incident_date",
645
+ "description", "incident_type", "classification",
646
+ ],
647
+ ),
648
+
649
+ # ── 27. Environmental Health & Safety (EHS) Logs ────────────────────
650
+ "ehs_logs": DocTypeDefinition(
651
+ id=27,
652
+ display_name="Environmental Health & Safety (EHS) Logs",
653
+ category="safety",
654
+ keywords=[
655
+ "EHS INCIDENT LOG", "ENVIRONMENTAL HEALTH SAFETY",
656
+ "INCIDENT TRACKER", "SPILL REPORT", "NEAR MISS REPORT",
657
+ "ENVIRONMENTAL INCIDENT REPORT",
658
+ "SAFETY OBSERVATION",
659
+ "incident id", "report number", "incident number",
660
+ "type", "category", "incident category",
661
+ "Spill", "Release", "Near Miss",
662
+ "location", "site", "facility", "area",
663
+ "severity", "impact level", "severity rating",
664
+ ],
665
+ fields=[
666
+ "incident_id", "location", "severity",
667
+ "corrective_action",
668
+ ],
669
+ ),
670
+
671
+ # ── 28. Health & Safety Inspection Logs ─────────────────────────────
672
+ "safety_inspections": DocTypeDefinition(
673
+ id=28,
674
+ display_name="Health & Safety Inspection Logs",
675
+ category="safety",
676
+ keywords=[
677
+ "SAFETY INSPECTION", "SITE SAFETY AUDIT",
678
+ "WORKPLACE INSPECTION", "SAFETY CHECKLIST",
679
+ "JSA (Job Safety Analysis)", "JOB SAFETY ANALYSIS",
680
+ "INSPECTION REPORT", "HAZARD ASSESSMENT",
681
+ "inspector", "auditor", "checked by", "inspected by",
682
+ "inspection date", "audit date", "date of inspection",
683
+ "hazard", "finding", "deficiency",
684
+ "risk", "priority", "criticality",
685
+ "risk level", "risk rating",
686
+ "status", "open/closed", "compliance", "compliance status",
687
+ ],
688
+ fields=[
689
+ "inspector", "inspection_date", "hazard_identified",
690
+ "risk_level", "status",
691
+ ],
692
+ ),
693
+
694
+ # ── 29. Reserve Report (PV-10) ──────────────────────────────────────
695
+ "reserve_report": DocTypeDefinition(
696
+ id=29,
697
+ display_name="Reserve Report (PV-10)",
698
+ category="oil_gas",
699
+ keywords=[
700
+ "RESERVE REPORT", "PV-10",
701
+ "PETROLEUM ENGINEERING REPORT",
702
+ "SEC RESERVES", "STANDARDIZED MEASURE",
703
+ "proved reserves", "proved developed and undeveloped",
704
+ "1p reserves", "total proved",
705
+ "oil (mbbls)", "oil barrels", "crude oil volume",
706
+ "condensate", "plant products",
707
+ "natural gas", "gas (mmcf)", "natural gas volume",
708
+ "future net revenue", "future net cash flows",
709
+ "discounted net revenue @ 10%", "pv-10",
710
+ "present value at 10%", "undiscounted cash flow",
711
+ "discounted future net cash flows",
712
+ "standardized measure",
713
+ ],
714
+ fields=[
715
+ "proved_reserves", "oil_volume", "gas_volume",
716
+ "future_net_revenue", "pv10_value",
717
+ ],
718
+ ),
719
+
720
+ # ── 30. Joint Interest Billing (JIB) ────────────────────────────────
721
+ "joint_interest_billing": DocTypeDefinition(
722
+ id=30,
723
+ display_name="Joint Interest Billing (JIB)",
724
+ category="oil_gas",
725
+ keywords=[
726
+ "JOINT INTEREST BILLING", "JIB STATEMENT",
727
+ "JOINT ACCOUNT", "PARTNER BILLING",
728
+ "OPERATING STATEMENT JOINT VENTURE",
729
+ "OIL AND GAS JIB",
730
+ "working interest", "wi %", "decimal interest", "wi",
731
+ "afe", "authority for expenditure", "afe number",
732
+ "project id", "lease", "well name", "property", "field",
733
+ "gross amount", "100% share", "total costs",
734
+ "your share", "owner's share", "net amount", "amount due",
735
+ "total joint account",
736
+ ],
737
+ fields=[
738
+ "afe_number", "working_interest", "expense_category",
739
+ "total_billings", "owner_share",
740
+ ],
741
+ ),
742
+
743
+ # ── 31. LOE Statement ───────────────────────────────────────────────
744
+ "loe_statement": DocTypeDefinition(
745
+ id=31,
746
+ display_name="LOE Statement (Lease Operating Expenses)",
747
+ category="oil_gas",
748
+ keywords=[
749
+ "LEASE OPERATING EXPENSES", "LOE STATEMENT",
750
+ "DIRECT OPERATING EXPENSE", "PRODUCTION COSTS",
751
+ "WELL EXPENSE", "WELL OPERATING STATEMENT",
752
+ "pumper labor", "contract labor", "operating labor",
753
+ "supervision", "chemicals", "treating", "methanol",
754
+ "chemical treatment", "salt water disposal", "swd",
755
+ "water disposal", "hauling", "workover",
756
+ "repairs & maintenance", "r&m", "maintenance",
757
+ "production costs", "direct operating expense",
758
+ ],
759
+ fields=[
760
+ "lease_name", "labor_cost", "chemicals",
761
+ "water_disposal", "repairs",
762
+ ],
763
+ ),
764
+
765
+ # ── 32. FERC Form 1 ────────────────────────────────────────────────
766
+ "ferc_form_1": DocTypeDefinition(
767
+ id=32,
768
+ display_name="FERC Form 1",
769
+ category="utilities",
770
+ keywords=[
771
+ "FERC FORM 1", "ELECTRIC UTILITY ANNUAL REPORT",
772
+ "MAJOR ELECTRIC UTILITY",
773
+ "FEDERAL ENERGY REGULATORY COMMISSION",
774
+ "FERC ACCOUNTS",
775
+ "respondent_name", "exact legal name of respondent",
776
+ "reporting entity", "utility name",
777
+ "electric plant in service", "plant in service",
778
+ "utility plant", "electric operating revenues",
779
+ "total operating revenue", "sales of electricity",
780
+ "megawatt hours", "mwh sold", "megawatt-hours sold",
781
+ "energy sales (mwh)",
782
+ ],
783
+ fields=[
784
+ "respondent_name", "year_ended", "plant_in_service",
785
+ "operating_revenues", "mwh_sold",
786
+ ],
787
+ ),
788
+
789
+ # ── 33. ARO Schedule ────────────────────────────────────────────────
790
+ "aro_schedule": DocTypeDefinition(
791
+ id=33,
792
+ display_name="ARO Schedule (Asset Retirement Obligation)",
793
+ category="accounting",
794
+ keywords=[
795
+ "ASSET RETIREMENT OBLIGATION", "ARO LIABILITY",
796
+ "ACCRETION SCHEDULE", "DECOMMISSIONING PROVISION",
797
+ "RECLAMATION LIABILITY", "FAS 143", "ASC 410",
798
+ "beginning aro", "opening aro", "beginning aro liability",
799
+ "accretion", "accretion expense", "accretion of discount",
800
+ "interest cost",
801
+ "new wells", "additions", "new aros",
802
+ "liabilities incurred", "new obligations",
803
+ "settlements", "obligations settled",
804
+ "remediation payments",
805
+ "ending balance", "closing aro", "ending aro liability",
806
+ ],
807
+ fields=[
808
+ "beginning_liability", "accretion_expense",
809
+ "new_obligations", "settlements", "ending_liability",
810
+ ],
811
+ ),
812
+
813
+ # ── 34. IFTA Fuel Tax Report ────────────────────────────────────────
814
+ "ifta_fuel_tax": DocTypeDefinition(
815
+ id=34,
816
+ display_name="IFTA Fuel Tax Report",
817
+ category="transportation",
818
+ keywords=[
819
+ "IFTA REPORT", "FUEL TAX RETURN",
820
+ "INTERNATIONAL FUEL TAX AGREEMENT",
821
+ "IFTA-100", "IFTA RETURN", "QUARTERLY FUEL TAX",
822
+ "jurisdiction", "state", "province", "member jurisdiction",
823
+ "total miles", "taxable miles", "distance traveled",
824
+ "total kilometers", "taxable distance",
825
+ "jurisdictional miles",
826
+ "fuel purchased", "gallons pumped", "gallons consumed",
827
+ "tax paid gallons", "fuel consumed", "fuel usage",
828
+ "total fuel", "gallons used",
829
+ "mpg", "fleet average mpg", "kpl", "average fuel economy",
830
+ "tax due", "refund", "net amount", "tax payable/(refund)",
831
+ ],
832
+ fields=[
833
+ "jurisdiction", "total_miles", "tax_paid_gallons",
834
+ "mpg", "net_tax_due",
835
+ ],
836
+ ),
837
+
838
+ # ── 35. DOT Hours of Service Logs ───────────────────────────────────
839
+ "dot_hos": DocTypeDefinition(
840
+ id=35,
841
+ display_name="DOT Hours of Service Logs",
842
+ category="transportation",
843
+ keywords=[
844
+ "DRIVER'S DAILY LOG", "HOURS OF SERVICE",
845
+ "RECORD OF DUTY STATUS", "ELD REPORT", "RODS",
846
+ "HOS LOG", "FMCSA",
847
+ "driver id", "driver name", "license number", "employee id",
848
+ "driving", "drive time", "hours driving",
849
+ "on duty", "on duty not driving", "off duty",
850
+ "sleeper berth", "rest hours",
851
+ "line 1", "line 3", "line 4",
852
+ "11 hour rule", "14 hour rule",
853
+ "hours of service violations", "hos violations",
854
+ "11 hour rule exceeded", "14 hour rule exceeded",
855
+ ],
856
+ fields=[
857
+ "driver_id", "driving_hours", "on_duty_hours",
858
+ "off_duty_hours", "violations",
859
+ ],
860
+ ),
861
+
862
+ # ── 36. Fleet Utilization Report ────────────────────────────────────
863
+ "fleet_utilization": DocTypeDefinition(
864
+ id=36,
865
+ display_name="Fleet Utilization Report",
866
+ category="transportation",
867
+ keywords=[
868
+ "FLEET UTILIZATION", "ASSET UTILIZATION",
869
+ "FLEET PRODUCTIVITY", "ODOMETER REPORT",
870
+ "vehicle id", "truck id", "fleet number", "asset id",
871
+ "utilization %", "active days %", "vehicle utilization",
872
+ "miles driven", "mileage", "total miles", "distance",
873
+ "idle time", "idling hours", "engine idle time",
874
+ "truck uptime", "vehicle activity",
875
+ ],
876
+ fields=[
877
+ "vehicle_id", "utilization_percent", "miles_driven",
878
+ "idle_time", "fuel_consumed",
879
+ ],
880
+ ),
881
+
882
+ # ── 37. Lane/Route Profitability ────────────────────────────────────
883
+ "lane_profitability": DocTypeDefinition(
884
+ id=37,
885
+ display_name="Lane/Route Profitability",
886
+ category="transportation",
887
+ keywords=[
888
+ "LANE PROFITABILITY", "ROUTE MARGIN",
889
+ "NET REVENUE PER LANE", "LOAD PROFITABILITY",
890
+ "FREIGHT MARGIN", "TRIP P&L",
891
+ "lane", "lane id", "route", "shipping lane",
892
+ "origin-destination", "o/d pair",
893
+ "revenue per mile", "rpm", "rate per mile",
894
+ "revenue/mile",
895
+ "cost per mile", "cpm", "operating cost/mile",
896
+ "total cost per mile",
897
+ "deadhead miles", "empty miles", "non-revenue miles",
898
+ "lane profit", "contribution margin", "net margin per mile",
899
+ "freight margin", "load profitability",
900
+ ],
901
+ fields=[
902
+ "lane_id", "revenue_per_mile", "cost_per_mile",
903
+ "deadhead", "margin",
904
+ ],
905
+ ),
906
+
907
+ # ── 38. Fuel Surcharge Schedule ─────────────────────────────────────
908
+ "fuel_surcharge": DocTypeDefinition(
909
+ id=38,
910
+ display_name="Fuel Surcharge Schedule",
911
+ category="transportation",
912
+ keywords=[
913
+ "FUEL SURCHARGE SCHEDULE", "FSC MATRIX",
914
+ "FUEL ADJUSTMENT FACTOR", "SURCHARGE TABLE",
915
+ "DOE DIESEL AVERAGE",
916
+ "fuel price", "peg price", "fuel index price",
917
+ "fuel index source", "doe average",
918
+ "price per gallon",
919
+ "surcharge per mile", "fsc rate", "surcharge percentage",
920
+ "adjustment factor", "surcharge rate",
921
+ "effective date", "week of", "effective period",
922
+ ],
923
+ fields=[
924
+ "fuel_price_range", "surcharge_rate", "effective_date",
925
+ "index_used",
926
+ ],
927
+ ),
928
+
929
+ # ── 39. Tip Reporting (Form 8027) ───────────────────────────────────
930
+ "tip_reporting": DocTypeDefinition(
931
+ id=39,
932
+ display_name="Tip Reporting (Form 8027)",
933
+ category="hospitality",
934
+ keywords=[
935
+ "FORM 8027", "TIP INCOME REPORT",
936
+ "EMPLOYER'S ANNUAL INFORMATION RETURN OF TIP INCOME",
937
+ "REPORT OF TIP INCOME", "ALLOCATED TIPS",
938
+ "GROSS RECEIPTS FROM FOOD",
939
+ "GROSS RECEIPTS FROM FOOD OR BEVERAGES",
940
+ "name of establishment", "restaurant name", "business name",
941
+ "establishment_name",
942
+ "charged tips", "credit card tips", "tips on charge receipts",
943
+ "cash tips", "direct cash tips",
944
+ "allocated tips", "allocated tip income",
945
+ "gross receipts", "food and beverage sales",
946
+ "gross sales", "total sales", "total revenue",
947
+ "tip shortfall", "shortfall",
948
+ ],
949
+ fields=[
950
+ "establishment_name", "gross_receipts", "charged_tips",
951
+ "cash_tips", "allocated_tips",
952
+ ],
953
+ ),
954
+
955
+ # ── 40. Daily Revenue Report ────────────────────────────────────────
956
+ "daily_revenue": DocTypeDefinition(
957
+ id=40,
958
+ display_name="Daily Revenue Report",
959
+ category="hospitality",
960
+ keywords=[
961
+ "DAILY SALES REPORT", "DSR", "FLASH REPORT",
962
+ "DAILY REVENUE TRACKER", "NIGHT AUDIT",
963
+ "NIGHT AUDIT REPORT", "POS SUMMARY",
964
+ "DAILY HIGHLIGHTS",
965
+ "date", "business day", "sales date",
966
+ "gross sales", "net sales", "total revenue", "daily revenue",
967
+ "discounts", "comps", "voids",
968
+ "promotional allowances", "promos",
969
+ "cash", "credit", "amex", "visa", "mastercard",
970
+ "credit card sales", "cash sales", "tender summary",
971
+ "covers", "guest count", "transactions", "customer count",
972
+ "net revenue", "sales net of discounts",
973
+ ],
974
+ fields=[
975
+ "date", "gross_sales", "discounts", "net_sales",
976
+ "payment_methods", "covers",
977
+ ],
978
+ ),
979
+
980
+ # ── 41. Departmental P&L ────────────────────────────────────────────
981
+ "departmental_pl": DocTypeDefinition(
982
+ id=41,
983
+ display_name="Departmental P&L",
984
+ category="operations",
985
+ keywords=[
986
+ "DEPARTMENTAL INCOME STATEMENT", "PROFIT AND LOSS BY DEPARTMENT",
987
+ "SEGMENT REPORTING", "COST CENTER REPORT",
988
+ "CONTRIBUTION BY DEPARTMENT", "DIVISIONAL P&L",
989
+ "department", "division", "cost center", "business unit",
990
+ "segment", "department_name",
991
+ "revenue", "sales", "departmental sales", "segment revenue",
992
+ "intercompany revenue",
993
+ "direct expenses", "departmental expenses",
994
+ "controllable expenses",
995
+ "contribution margin", "departmental income",
996
+ "departmental contribution", "segment profit",
997
+ ],
998
+ fields=[
999
+ "department_name", "dept_revenue", "direct_expenses",
1000
+ "contribution_margin",
1001
+ ],
1002
+ ),
1003
+
1004
+ # ── 42. Balance Sheet ───────────────────────────────────────────────
1005
+ "balance_sheet": DocTypeDefinition(
1006
+ id=42,
1007
+ display_name="Balance Sheet",
1008
+ category="core_financial",
1009
+ keywords=[
1010
+ "CONSOLIDATED BALANCE SHEETS",
1011
+ "STATEMENT OF FINANCIAL POSITION",
1012
+ "ASSETS LIABILITIES AND EQUITY",
1013
+ "LIABILITIES AND STOCKHOLDERS' EQUITY",
1014
+ "cash", "accounts receivable", "inventory",
1015
+ "prepaid expenses", "property plant and equipment",
1016
+ "accumulated depreciation", "intangible assets",
1017
+ "accounts payable", "accrued liabilities",
1018
+ "short-term debt", "long-term debt", "deferred revenue",
1019
+ "total equity", "shareholders' equity",
1020
+ "retained earnings",
1021
+ "total assets", "total liabilities",
1022
+ ],
1023
+ fields=[
1024
+ "Assets", "Liabilities", "Equity",
1025
+ ],
1026
+ ),
1027
+
1028
+ # ── 43. Income Statement ────────────────────────────────────────────
1029
+ "income_statement": DocTypeDefinition(
1030
+ id=43,
1031
+ display_name="Income Statement",
1032
+ category="core_financial",
1033
+ keywords=[
1034
+ "CONSOLIDATED STATEMENTS OF OPERATIONS",
1035
+ "STATEMENTS OF INCOME", "STATEMENT OF EARNINGS",
1036
+ "PROFIT AND LOSS", "P&L",
1037
+ "revenue", "sales", "net sales", "total revenue",
1038
+ "cost of goods sold", "cogs", "cost of sales",
1039
+ "gross profit", "operating expenses",
1040
+ "marketing", "payroll", "rent", "depreciation",
1041
+ "amortization", "interest expense",
1042
+ "operating income", "net income", "net profit",
1043
+ "net earnings", "ebitda",
1044
+ ],
1045
+ fields=[
1046
+ "revenue", "cogs", "operating_expenses",
1047
+ "net_income",
1048
+ ],
1049
+ ),
1050
+
1051
+ # ── 44. Cash Flow Statement ─────────────────────────────────────────
1052
+ "cash_flow_statement": DocTypeDefinition(
1053
+ id=44,
1054
+ display_name="Cash Flow Statement",
1055
+ category="core_financial",
1056
+ keywords=[
1057
+ "CONSOLIDATED STATEMENTS OF CASH FLOWS",
1058
+ "STATEMENT OF CASH FLOWS", "CASH FLOW STATEMENT",
1059
+ "net cash provided by operating activities",
1060
+ "cash from operations", "operating activities",
1061
+ "cash used in investing activities", "investing activities",
1062
+ "cash from investing",
1063
+ "cash provided by financing", "financing activities",
1064
+ "cash from financing activities",
1065
+ "net change in cash", "net increase/(decrease)",
1066
+ "capital expenditures", "purchase of property and equipment",
1067
+ "capex",
1068
+ ],
1069
+ fields=[
1070
+ "operating_cash_flow", "investing_cash_flow",
1071
+ "financing_cash_flow", "capex",
1072
+ ],
1073
+ ),
1074
+
1075
+ # ── 45. Statement of Shareholders' Equity ──────────────────────────
1076
+ "shareholders_equity": DocTypeDefinition(
1077
+ id=45,
1078
+ display_name="Statement of Shareholders' Equity",
1079
+ category="core_financial",
1080
+ keywords=[
1081
+ "STATEMENT OF STOCKHOLDERS EQUITY",
1082
+ "CHANGES IN EQUITY", "RETAINED EARNINGS STATEMENT",
1083
+ "CONSOLIDATED STATEMENT OF CHANGES IN EQUITY",
1084
+ "beginning balance", "balance at beginning of period",
1085
+ "opening equity", "beginning equity",
1086
+ "net income", "comprehensive income", "net profit",
1087
+ "dividends", "distributions", "dividends declared",
1088
+ "issuance of common stock", "exercise of options",
1089
+ "exercise of stock options",
1090
+ "share-based compensation", "stock-based compensation",
1091
+ "repurchase", "treasury stock", "share buybacks",
1092
+ "repurchase of common stock",
1093
+ "ending balance", "balance at end of period", "closing equity",
1094
+ ],
1095
+ fields=[
1096
+ "beginning_equity", "net_income", "dividends",
1097
+ "stock_issuance", "stock_repurchase", "ending_equity",
1098
+ ],
1099
+ ),
1100
+
1101
+ # ── 46. Budget vs. Actuals (BvA) Report ─────────────────────────────
1102
+ "budget_vs_actuals": DocTypeDefinition(
1103
+ id=46,
1104
+ display_name="Budget vs. Actuals (BvA) Report",
1105
+ category="budgeting",
1106
+ keywords=[
1107
+ "BUDGET VS ACTUAL", "VARIANCE REPORT",
1108
+ "BVA", "FORECAST VS ACTUAL", "PLAN VS ACTUAL",
1109
+ "MANAGEMENT REPORT", "MONTHLY PERFORMANCE",
1110
+ "actual", "budget", "plan", "forecast", "target",
1111
+ "budgeted amount", "current period actual",
1112
+ "mtd actual", "ytd actual", "actuals",
1113
+ "variance $", "variance %", "var", "diff",
1114
+ "over/under", "over/(under)", "amount variance",
1115
+ "percentage variance", "% diff",
1116
+ "favorable", "unfavorable",
1117
+ ],
1118
+ fields=[
1119
+ "gl_account", "actual_amount", "budget_amount",
1120
+ "variance_amount", "variance_percent",
1121
+ ],
1122
+ ),
1123
+
1124
+ # ── 47. Aged Accounts Receivable (AR) Report ───────────────────────
1125
+ "aged_ar": DocTypeDefinition(
1126
+ id=47,
1127
+ display_name="Aged Accounts Receivable (AR) Report",
1128
+ category="credit",
1129
+ keywords=[
1130
+ "AGED RECEIVABLES", "AR AGING",
1131
+ "AGED TRIAL BALANCE RECEIVABLES",
1132
+ "OPEN INVOICE REPORT", "RECEIVABLES AGING SUMMARY",
1133
+ "customer", "client", "customer name", "client",
1134
+ "current", "not due", "<30", "<30 days",
1135
+ "0-30", "1-30 days", "30-60", "31-60",
1136
+ "60-90", "61-90", "90+", "over 90", ">90",
1137
+ "past due 90+", "91+ days",
1138
+ "total receivable", "total due", "balance",
1139
+ "amount due", "open invoice report",
1140
+ ],
1141
+ fields=[
1142
+ "customer_name", "current_bucket", "bucket_30_60",
1143
+ "bucket_60_90", "bucket_90_plus", "total_due",
1144
+ ],
1145
+ ),
1146
+
1147
+ # ── 48. Aged Accounts Payable (AP) Report ──────────────────────────
1148
+ "aged_ap": DocTypeDefinition(
1149
+ id=48,
1150
+ display_name="Aged Accounts Payable (AP) Report",
1151
+ category="credit",
1152
+ keywords=[
1153
+ "AGED PAYABLES", "AP AGING",
1154
+ "AGED TRIAL BALANCE PAYABLES", "VENDOR AGING",
1155
+ "PAYABLES AGING SUMMARY",
1156
+ "vendor", "supplier", "vendor name",
1157
+ "current", "not due", "<30 days",
1158
+ "0-30", "30-60", "31-60", "60-90", "61-90",
1159
+ "90+", "over 90", ">90", ">90 days",
1160
+ "total accounts payable", "balance",
1161
+ "amount payable", "total liability",
1162
+ ],
1163
+ fields=[
1164
+ "vendor_name", "current_bucket", "bucket_30_60",
1165
+ "bucket_60_90", "bucket_90_plus",
1166
+ ],
1167
+ ),
1168
+
1169
+ # ── 49. Headcount & Payroll Register ────────────────────────────────
1170
+ "headcount_payroll": DocTypeDefinition(
1171
+ id=49,
1172
+ display_name="Headcount & Payroll Register",
1173
+ category="hr",
1174
+ keywords=[
1175
+ "HEADCOUNT REPORT", "PAYROLL REGISTER",
1176
+ "CENSUS", "FTE REPORT", "EMPLOYEE LIST",
1177
+ "SALARY ROSTER", "EMPLOYEE CENSUS",
1178
+ "PAYROLL SUMMARY",
1179
+ "employee id", "eeid", "employee number", "personnel number",
1180
+ "file number",
1181
+ "department", "cost center", "division", "unit",
1182
+ "annual salary", "base salary", "base rate", "hourly rate",
1183
+ "gross pay", "gross wages", "total earnings", "total compensation",
1184
+ "net pay", "net amount", "take home", "take home pay",
1185
+ "check amount",
1186
+ "deductions", "taxes withheld", "401k",
1187
+ "benefit deductions", "withholdings",
1188
+ ],
1189
+ fields=[
1190
+ "employee_id", "department", "salary", "gross_pay",
1191
+ "net_pay", "deductions",
1192
+ ],
1193
+ ),
1194
+
1195
+ # ── 50. Debt Schedule & Covenant Compliance ────────────────────────
1196
+ "debt_covenant": DocTypeDefinition(
1197
+ id=50,
1198
+ display_name="Debt Schedule & Covenant Compliance",
1199
+ category="treasury",
1200
+ keywords=[
1201
+ "COMPLIANCE CERTIFICATE", "COVENANT CALCULATIONS",
1202
+ "DEBT SERVICE COVERAGE RATIO", "LEVERAGE RATIO",
1203
+ "BORROWING BASE CERTIFICATE", "FIXED CHARGE COVERAGE",
1204
+ "LOAN AGREEMENT COMPLIANCE", "DSCR",
1205
+ "covenant", "financial requirement",
1206
+ "debt service coverage ratio", "current ratio",
1207
+ "leverage ratio", "fixed charge coverage",
1208
+ "minimum", "maximum", "threshold", "required",
1209
+ "calculated", "actual", "current period",
1210
+ "pass/fail", "in compliance", "compliant", "met",
1211
+ "outstanding principal", "loan balance",
1212
+ "debt outstanding", "ending debt balance", "debt schedule",
1213
+ ],
1214
+ fields=[
1215
+ "covenant_name", "required_value", "actual_value",
1216
+ "compliance_status", "debt_balance",
1217
+ ],
1218
+ ),
1219
+
1220
+ # ── 51. Tax Provision Workpapers (ASC 740) ──────────────────────────
1221
+ "tax_provision": DocTypeDefinition(
1222
+ id=51,
1223
+ display_name="Tax Provision Workpapers (ASC 740)",
1224
+ category="tax",
1225
+ keywords=[
1226
+ "TAX PROVISION", "ASC 740", "INCOME TAX PROVISION",
1227
+ "DEFERRED TAX ASSET", "DEFERRED TAX LIABILITY",
1228
+ "EFFECTIVE TAX RATE", "FAS 109",
1229
+ "pre-tax income", "book income", "income before taxes",
1230
+ "permanent differences", "non-deductible",
1231
+ "non-deductible expenses", "meals and entertainment",
1232
+ "non-taxable income",
1233
+ "temporary differences", "timing differences",
1234
+ "depreciation adjustment", "accruals",
1235
+ "current expense", "current provision",
1236
+ "current tax expense", "current taxes payable",
1237
+ "deferred expense", "deferred provision",
1238
+ "deferred tax expense", "change in deferred taxes",
1239
+ ],
1240
+ fields=[
1241
+ "pre_tax_income", "permanent_diff", "temporary_diff",
1242
+ "current_tax_expense", "deferred_tax_expense",
1243
+ ],
1244
+ ),
1245
+
1246
+ # ── 52. Capital Expenditure (CapEx) Budget ──────────────────────────
1247
+ "capex_budget": DocTypeDefinition(
1248
+ id=52,
1249
+ display_name="Capital Expenditure (CapEx) Budget",
1250
+ category="budgeting",
1251
+ keywords=[
1252
+ "CAPEX BUDGET", "CAPITAL PLAN", "INVESTMENT BUDGET",
1253
+ "CAPITAL SPENDING PLAN", "CAPITAL FORECAST",
1254
+ "PROJECT BUDGET", "LONG-RANGE CAPITAL PLAN",
1255
+ "project", "project name", "initiative", "item description",
1256
+ "investment", "capex category", "asset type", "category",
1257
+ "budget", "budgeted amount", "approved amount",
1258
+ "authorized amount", "total cost",
1259
+ "spend to date", "incurred", "incurred to date",
1260
+ "cumulative spend", "actuals",
1261
+ "remaining budget", "remaining", "remaining funds",
1262
+ "available", "balance",
1263
+ ],
1264
+ fields=[
1265
+ "project_name", "budget_amount", "spend_to_date",
1266
+ "remaining_budget", "asset_category",
1267
+ ],
1268
+ ),
1269
+
1270
+ # ── 53. Insurance Policy Declarations ───────────────────────────────
1271
+ "insurance_declarations": DocTypeDefinition(
1272
+ id=53,
1273
+ display_name="Insurance Policy Declarations",
1274
+ category="risk",
1275
+ keywords=[
1276
+ "DECLARATIONS PAGE", "CERTIFICATE OF INSURANCE",
1277
+ "POLICY DECLARATIONS", "POLICY NUMBER",
1278
+ "SCHEDULE OF COVERAGE", "SCHEDULE OF COVERAGES AND LIMITS",
1279
+ "INSURANCE POLICY", "LIMITS OF LIABILITY",
1280
+ "policy number", "policy no", "policy id",
1281
+ "named insured", "insured", "policyholder",
1282
+ "policy period", "effective dates",
1283
+ "policy effective date", "policy expiration date",
1284
+ "premium", "total premium", "annual premium",
1285
+ "policy cost", "cost",
1286
+ "coverage", "coverage part", "type of insurance",
1287
+ "General Liability", "Auto Liability",
1288
+ "Workers Compensation", "Property",
1289
+ "occurrence limit", "aggregate limit",
1290
+ "each occurrence limit", "policy limit", "coverage limit",
1291
+ "limit of liability", "limits of liability",
1292
+ ],
1293
+ fields=[
1294
+ "policy_number", "insured_name", "policy_period",
1295
+ "premium", "coverage_type", "limit_amount",
1296
+ ],
1297
+ ),
1298
+ }
1299
+
1300
+
1301
+ # ============================================================================
1302
+ # CATEGORY DISPLAY NAMES (for frontend grouping)
1303
+ # ============================================================================
1304
+
1305
# Maps internal category keys (DocTypeDefinition.category) to the
# human-readable group labels used by the frontend for grouping doc types.
# Keys not present here fall back to the raw category key (see
# get_all_doc_types_summary).
CATEGORY_NAMES: Dict[str, str] = {
    "compliance": "Compliance & Audit",
    "saas_metrics": "SaaS / Subscription Metrics",
    "revenue_recognition": "Revenue Recognition",
    "sales": "Sales & Bookings",
    "tax": "Tax & Filings",
    "inventory": "Inventory Management",
    "retail": "Retail & Merchandising",
    "treasury": "Treasury & Cash Management",
    "real_estate": "Real Estate",
    "hospitality": "Hospitality & F&B",
    "accounting": "Accounting & Leases",
    "manufacturing": "Manufacturing & Production",
    "procurement": "Procurement & Purchasing",
    "safety": "Health, Safety & Environmental",
    "oil_gas": "Oil, Gas & Energy",
    "utilities": "Utilities & Regulated",
    "transportation": "Transportation & Fleet",
    "core_financial": "Core Financial Statements",
    "budgeting": "Budgeting & Planning",
    "credit": "Credit & Collections",
    "hr": "HR & Payroll",
    "operations": "Operations & Segments",
    "risk": "Risk & Insurance",
    "general": "General",
}
1331
+
1332
+
1333
+ # ============================================================================
1334
+ # LEARNED KEYWORDS LOADER
1335
+ # ============================================================================
1336
+
1337
# Path to the admin-approved learned-keywords store, kept next to this module.
_LEARNED_KEYWORDS_PATH = os.path.join(
    os.path.dirname(__file__), "learned_keywords.json"
)


def load_learned_keywords() -> Dict[str, List[str]]:
    """Load admin-approved learned keywords from the JSON sidecar file.

    Returns:
        Mapping of doc-type key -> list of learned keyword strings.
        An empty dict when the file is missing, corrupt, or unreadable
        (failures are logged, never raised — classification must not
        break because of a bad sidecar file).
    """
    if not os.path.exists(_LEARNED_KEYWORDS_PATH):
        return {}
    try:
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(_LEARNED_KEYWORDS_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError) as e:
        logger.warning(f"Failed to load learned keywords: {e}")
        return {}
1352
+
1353
+
1354
def save_learned_keywords(learned: Dict[str, List[str]]) -> None:
    """Persist learned keywords to the JSON sidecar file.

    Args:
        learned: Mapping of doc-type key -> list of learned keyword strings.

    Write failures are logged rather than raised (best-effort persistence,
    mirroring load_learned_keywords).
    """
    try:
        # utf-8 + ensure_ascii=False so non-ASCII keywords round-trip readably.
        with open(_LEARNED_KEYWORDS_PATH, "w", encoding="utf-8") as f:
            json.dump(learned, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved learned keywords to {_LEARNED_KEYWORDS_PATH}")
    except OSError as e:
        logger.error(f"Failed to save learned keywords: {e}")
1362
+
1363
+
1364
def get_effective_keywords(doc_type_key: str) -> List[str]:
    """
    Get the full keyword list for a doc type, merging base + learned.

    Returns:
        Combined list of base keywords + any learned keywords for this type.
        Empty list when the key is not in the registry.
    """
    definition = DOC_TYPE_REGISTRY.get(doc_type_key)
    if definition is None:
        return []

    # Base keywords first, then learned additions, de-duplicated while
    # preserving insertion order.
    merged = list(definition.keywords)
    seen = set(merged)
    for extra in load_learned_keywords().get(doc_type_key, []):
        if extra not in seen:
            merged.append(extra)
            seen.add(extra)

    return merged
1385
+
1386
+
1387
def get_all_doc_types_summary() -> List[Dict]:
    """
    Get a summary list of all doc types for the frontend.

    Returns:
        List of dicts (sorted by numeric id) with id, key, display_name,
        category, category_name, keyword_count, field_count and
        learned_keyword_count.
    """
    learned = load_learned_keywords()
    summaries = [
        {
            "id": definition.id,
            "key": key,
            "display_name": definition.display_name,
            "category": definition.category,
            "category_name": CATEGORY_NAMES.get(
                definition.category, definition.category
            ),
            # keyword_count includes admin-approved learned keywords.
            "keyword_count": len(definition.keywords) + len(learned.get(key, [])),
            "field_count": len(definition.fields),
            "learned_keyword_count": len(learned.get(key, [])),
        }
        for key, definition in DOC_TYPE_REGISTRY.items()
    ]
    summaries.sort(key=lambda entry: entry["id"])
    return summaries
app/services/ingestion/dolphin/__init__.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dolphin PDF Extraction Module — Hybrid Architecture.
3
+
4
+ Uses ByteDance Dolphin-v2 for advanced document layout analysis,
5
+ classification, and element extraction, combined with pdfplumber
6
+ for gap-filling and validation.
7
+
8
+ ## Quick Check
9
+
10
+ ```python
11
+ from app.services.ingestion.dolphin import is_dolphin_available, ensure_model_downloaded
12
+
13
+ if is_dolphin_available():
14
+ from app.services.ingestion.dolphin.client import DolphinClient
15
+ client = DolphinClient()
16
+ ```
17
+ """
18
+
19
+ import os
20
+ import logging
21
+ from typing import Optional
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
# Default model storage location (relative to backend root):
# <repo>/models/dolphin-v2, resolved from this package's directory.
DEFAULT_MODEL_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "..", "..", "..", "..", "models", "dolphin-v2"
)

# Cached result of is_dolphin_available(); None means "not probed yet".
_dolphin_available: Optional[bool] = None
32
+
33
+
34
+ def _detect_device() -> str:
35
+ """Auto-detect best available compute device: cuda > mps > cpu."""
36
+ try:
37
+ import torch
38
+ if torch.cuda.is_available():
39
+ logger.info("Dolphin device: CUDA GPU detected")
40
+ return "cuda"
41
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
42
+ logger.info("Dolphin device: Apple MPS (Metal) detected")
43
+ return "mps"
44
+ except ImportError:
45
+ pass
46
+ logger.info("Dolphin device: CPU mode")
47
+ return "cpu"
48
+
49
+
50
def _get_model_path() -> str:
    """Resolve the Dolphin model directory: configured path, else default."""
    configured = None
    try:
        from app.core.config import settings

        configured = settings.DOLPHIN_MODEL_PATH
    except Exception:
        # Config unavailable (e.g. standalone use) — fall back to default.
        pass
    return configured if configured else os.path.abspath(DEFAULT_MODEL_DIR)
59
+
60
+
61
def is_dolphin_available() -> bool:
    """
    Check if Dolphin model and dependencies are installed.

    Returns True when either a remote Dolphin API is configured, or the
    local dependencies (torch, transformers, PIL) are importable AND the
    model directory contains a config plus weights.
    Result is cached after first check (see _dolphin_available).
    """
    global _dolphin_available
    if _dolphin_available is not None:
        return _dolphin_available

    # If remote API is configured, we consider Dolphin available
    # (the remote worker manages the model). Guard the import so a
    # missing/broken config module degrades to local detection —
    # consistent with _get_model_path / get_device.
    try:
        from app.core.config import settings

        if settings.DOLPHIN_API_URL:
            _dolphin_available = True
            return True
    except Exception:
        pass

    try:
        import torch  # noqa: F401
        import transformers  # noqa: F401
        from PIL import Image  # noqa: F401

        model_path = _get_model_path()
        if os.path.isdir(model_path):
            # Require a config plus at least one weights file:
            # single-file safetensors, legacy .bin, or sharded safetensors
            # ("model-00001-of-000NN.safetensors").
            has_config = os.path.exists(os.path.join(model_path, "config.json"))
            has_weights = (
                os.path.exists(os.path.join(model_path, "model.safetensors"))
                or os.path.exists(os.path.join(model_path, "pytorch_model.bin"))
                or any(
                    f.startswith("model-")
                    for f in os.listdir(model_path)
                    if f.endswith(".safetensors")
                )
            )
            _dolphin_available = has_config and has_weights
        else:
            _dolphin_available = False

    except ImportError as e:
        logger.debug(f"Dolphin dependencies not installed: {e}")
        _dolphin_available = False

    logger.info(f"Dolphin availability: {_dolphin_available}")
    return _dolphin_available
101
+
102
+
103
def ensure_model_downloaded(force: bool = False) -> str:
    """
    Download Dolphin-v2 model from HuggingFace if not already present.

    Args:
        force: If True, re-download even if model exists

    Returns:
        Path to the downloaded model directory

    Raises:
        RuntimeError: if the snapshot download fails (chained to the cause).
    """
    model_path = _get_model_path()

    # Presence of config.json is treated as "model already downloaded".
    if not force and os.path.isdir(model_path):
        config_path = os.path.join(model_path, "config.json")
        if os.path.exists(config_path):
            logger.info(f"Dolphin model already present at {model_path}")
            return model_path

    logger.info("Downloading Dolphin-v2 model from HuggingFace...")

    try:
        from huggingface_hub import snapshot_download

        os.makedirs(model_path, exist_ok=True)
        snapshot_download(
            repo_id="ByteDance/Dolphin-v2",
            local_dir=model_path,
            local_dir_use_symlinks=False,
        )
        logger.info(f"Dolphin-v2 model downloaded to {model_path}")

        # Invalidate cache so next check picks up the new model
        global _dolphin_available
        _dolphin_available = None

        return model_path

    except Exception as e:
        logger.error(f"Failed to download Dolphin model: {e}")
        # BUG FIX: the final message segment was a plain string, so users saw
        # the literal text "{model_path}" — it must be an f-string.
        raise RuntimeError(
            f"Dolphin model download failed: {e}. "
            "Install huggingface-hub and ensure network access, "
            f"or manually download to: {model_path}"
        ) from e
147
+
148
+
149
def get_device() -> str:
    """Get configured or auto-detected device."""
    configured = "auto"
    try:
        from app.core.config import settings

        configured = getattr(settings, "DOLPHIN_DEVICE", "auto")
    except Exception:
        # Config unavailable — fall through to auto-detection.
        pass
    return configured if configured != "auto" else _detect_device()
app/services/ingestion/dolphin/classifier.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document Classifier — 53-Type Keyword Classification System
3
+ ============================================================
4
+ Identifies financial document types from parsed content using keyword
5
+ matching against the doc_keywords registry. Uses an 80% keyword match
6
+ threshold for high-confidence classification with a 3-tier fallback.
7
+
8
+ Tiers:
9
+ ≥80% → High confidence (classified)
10
+ 50-79% → Low confidence (classified, flagged needs_review)
11
+ <50% → No match (general_financial fallback)
12
+ """
13
+
14
+ import re
15
+ import logging
16
+ from typing import List, Dict, Tuple, Optional
17
+ from dataclasses import dataclass, field
18
+
19
+ from ..doc_keywords import (
20
+ DOC_TYPE_REGISTRY,
21
+ get_effective_keywords,
22
+ get_all_doc_types_summary,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
# ============================================================================
# CLASSIFICATION THRESHOLDS
# ============================================================================
# Fraction of a doc type's keywords that must be found in the text.
# Compared against the per-type match percentage (threshold * 100).
HIGH_CONFIDENCE_THRESHOLD = 0.80  # ≥80% → classified
LOW_CONFIDENCE_THRESHOLD = 0.50  # 50-79% → flagged needs_review
33
+
34
+
35
@dataclass
class DocumentClassification:
    """Classification result for a parsed document.

    Produced by DocumentClassifier.classify(); carries the winning doc type,
    the keyword-match evidence behind it, and review/fallback flags.
    """
    doc_type: str  # Internal key (e.g. "rent_roll")
    doc_type_display: str = ""  # Human name (e.g. "Rent Roll")
    confidence: float = 0.0  # 0.0 - 1.0
    match_percentage: float = 0.0  # % of keywords matched (0-100)
    needs_review: bool = False  # True if 50-79% match
    matched_keywords: List[str] = field(default_factory=list)  # Keywords hit in text
    extractable_fields: List[str] = field(default_factory=list)  # Fields from registry
    detected_sections: List[str] = field(default_factory=list)  # Layout sections (Dolphin)
    extraction_method: str = "dolphin_hybrid"  # How the text was extracted
    secondary_types: List[str] = field(default_factory=list)  # Other plausible types
    category: str = "general"  # Registry category key
+
50
+
51
class DocumentClassifier:
    """
    Classifies financial documents using the 53-type keyword registry.

    Scans the combined Dolphin + pdfplumber text against every registered
    doc type, counts keyword hits, and applies a 3-tier threshold:

      * >= HIGH_CONFIDENCE_THRESHOLD (80%)  -> classified outright
      * >= LOW_CONFIDENCE_THRESHOLD (50%)   -> classified, flagged needs_review
      * below 50%                           -> fallback to "general_financial"

    Usage:
        classifier = DocumentClassifier()
        result = classifier.classify(combined_text)
    """

    @staticmethod
    def classify(
        text_content: str,
        dolphin_sections: Optional[List[Dict]] = None,
        dolphin_elements: Optional[list] = None,
    ) -> "DocumentClassification":
        """
        Classify the document based on combined extracted text.

        Args:
            text_content: Full text extracted from BOTH engines
                (Dolphin + pdfplumber).
            dolphin_sections: Layout sections from Dolphin (if available);
                their "type" labels are surfaced as ``detected_sections``.
            dolphin_elements: Parsed elements from Dolphin (currently unused,
                accepted for forward compatibility).

        Returns:
            DocumentClassification with type, confidence, matched keywords,
            and extractable fields.
        """
        if not text_content:
            # Nothing to score — neutral zero-confidence fallback.
            return DocumentClassification(
                doc_type="general_financial",
                doc_type_display="General Financial",
                confidence=0.0,
                extraction_method="dolphin_hybrid",
            )

        text_lower = text_content.lower()

        # ── Score every registered doc type ──────────────────────────
        scores: Dict[str, Dict] = {}

        for doc_key, doc_def in DOC_TYPE_REGISTRY.items():
            # Effective keywords = base registry keywords + learned ones.
            all_keywords = get_effective_keywords(doc_key)
            if not all_keywords:
                continue

            matched = [kw for kw in all_keywords if kw.lower() in text_lower]
            total = len(all_keywords)
            pct = (len(matched) / total * 100) if total > 0 else 0.0

            scores[doc_key] = {
                "matched": matched,
                "total": total,
                "percentage": pct,
                "fields": doc_def.fields,
                "display_name": doc_def.display_name,
                "category": doc_def.category,
            }

        # ── Find top match ───────────────────────────────────────────
        if not scores:
            # Registry empty or no keywords anywhere — minimal confidence.
            return DocumentClassification(
                doc_type="general_financial",
                doc_type_display="General Financial",
                confidence=0.1,
                extraction_method="dolphin_hybrid",
            )

        best_key = max(scores, key=lambda k: scores[k]["percentage"])
        best = scores[best_key]
        best_pct = best["percentage"]

        # ── Apply 3-tier threshold ───────────────────────────────────
        if best_pct >= HIGH_CONFIDENCE_THRESHOLD * 100:
            # Tier 1: high confidence — accept outright.
            confidence = min(best_pct / 100.0, 1.0)
            needs_review = False
        elif best_pct >= LOW_CONFIDENCE_THRESHOLD * 100:
            # Tier 2: low confidence — classify but flag for human review.
            confidence = best_pct / 100.0
            needs_review = True
        else:
            # Tier 3: no usable match — log and fall back immediately so the
            # tier-1/2 result construction below never sees a weak match.
            logger.info(
                f"No doc type matched at ≥50%. Best: {best_key} "
                f"({best_pct:.1f}%). Falling back to general_financial."
            )
            return DocumentClassification(
                doc_type="general_financial",
                doc_type_display="General Financial",
                confidence=round(max(best_pct / 100.0, 0.1), 3),
                match_percentage=round(best_pct, 1),
                needs_review=False,
                matched_keywords=[],
                extractable_fields=[],
                extraction_method="dolphin_hybrid",
                secondary_types=[],
                category="general",
            )

        doc_type = best_key

        # ── Gather secondary types (other types with decent matches) ─
        secondary = [
            k for k, v in scores.items()
            if v["percentage"] >= 30.0 and k != doc_type
        ]
        # Sort secondaries by match percentage descending
        secondary.sort(key=lambda k: scores[k]["percentage"], reverse=True)

        # Surface Dolphin layout section labels when the caller provided them
        # (previously this parameter was accepted but never used).
        detected_sections = [
            str(s.get("type", ""))
            for s in (dolphin_sections or [])
            if isinstance(s, dict) and s.get("type")
        ]

        logger.info(
            f"Classified as '{doc_type}' ({best['display_name']}) "
            f"with {best_pct:.1f}% keyword match "
            f"({len(best['matched'])}/{best['total']} keywords). "
            f"needs_review={needs_review}"
        )

        return DocumentClassification(
            doc_type=doc_type,
            doc_type_display=best["display_name"],
            confidence=round(confidence, 3),
            match_percentage=round(best_pct, 1),
            needs_review=needs_review,
            matched_keywords=best["matched"],
            extractable_fields=best["fields"],
            detected_sections=detected_sections,
            extraction_method="dolphin_hybrid",
            secondary_types=secondary[:5],  # Top 5 secondary matches
            category=best["category"],
        )

    @staticmethod
    def get_financial_statement_types(classification: "DocumentClassification") -> List[str]:
        """
        Return the list of financial statement types that should be
        extracted from this document.

        For core financial statements, returns the matching type.
        For 10-K/10-Q and general_financial, returns all three.
        For specialized doc types, returns relevant statement types
        plus any secondaries detected.
        """
        # Comprehensive types always extract all three
        comprehensive_types = {"10-K", "10-Q", "general_financial"}

        if classification.doc_type in comprehensive_types:
            return ["income", "balance", "cash_flow"]

        # Core financial statement type mappings
        type_map = {
            "income_statement": ["income"],
            "balance_sheet": ["balance"],
            "cash_flow_statement": ["cash_flow"],
            "bank_statement": ["cash_flow"],
            "invoice": ["income"],
            "tax_return": ["income"],
            # Specialized types that primarily contain income-like data
            "arr_mrr_waterfall": ["income"],
            "deferred_revenue_schedule": ["income", "balance"],
            "cac_ltv_model": ["income"],
            "noi_statement": ["income"],
            "cogm": ["income"],
            "production_variance": ["income"],
            "departmental_pl": ["income"],
            "daily_revenue": ["income"],
            "budget_vs_actuals": ["income"],
            # Balance-sheet focused
            "rent_roll": ["income", "balance"],
            "asc_842_lease": ["balance"],
            "fixed_asset_rollforward": ["balance"],
            "wip_valuation": ["balance"],
            "aged_ar": ["balance"],
            "aged_ap": ["balance"],
            "debt_covenant": ["balance"],
            "aro_schedule": ["balance"],
            "reserve_report": ["balance"],
            # Cash flow focused
            "thirteen_week_cash_flow": ["cash_flow"],
            "capex_reserve": ["cash_flow", "balance"],
            "capex_budget": ["cash_flow"],
            # Equity
            "shareholders_equity": ["balance"],
        }

        # Copy so appending secondaries below never mutates the map's lists.
        base = list(type_map.get(
            classification.doc_type,
            ["income", "balance", "cash_flow"],  # Default: extract all
        ))

        # Add statement types implied by secondary classifications
        for sec_type in classification.secondary_types:
            for extra in type_map.get(sec_type, []):
                if extra not in base:
                    base.append(extra)

        return base

    @staticmethod
    def classify_with_details(text_content: str) -> Dict:
        """
        Classify and return a full details dict for API responses.

        Returns a JSON-serializable dict with all classification details.
        """
        result = DocumentClassifier.classify(text_content)
        return {
            "doc_type": result.doc_type,
            "doc_type_display": result.doc_type_display,
            "confidence": result.confidence,
            "match_percentage": result.match_percentage,
            "needs_review": result.needs_review,
            "matched_keywords": result.matched_keywords,
            "extractable_fields": result.extractable_fields,
            "secondary_types": result.secondary_types,
            "category": result.category,
        }
app/services/ingestion/dolphin/client.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dolphin Client — Wraps the ByteDance Dolphin-v2 model for document parsing.
3
+
4
+ Provides page-level, element-level, and layout parsing capabilities
5
+ with automatic device selection (CUDA > MPS > CPU).
6
+ """
7
+
8
+ import os
9
+ import logging
10
+ from typing import List, Dict, Any, Optional
11
+ from dataclasses import dataclass, field
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Data classes for Dolphin outputs
18
+ # ---------------------------------------------------------------------------
19
+
20
@dataclass
class DolphinElement:
    """A single parsed element from a document page."""
    element_type: str  # One of: "text", "table", "formula", "figure", "code"
    content: str  # Markdown or plain text content
    bbox: Optional[List[float]] = None  # [x1, y1, x2, y2] bounding box, if known
    confidence: float = 1.0  # Parser confidence, 0.0 - 1.0
    page_number: int = 0  # 0-based page index this element came from
    metadata: Dict[str, Any] = field(default_factory=dict)  # Extra parser-specific info
29
+
30
+
31
@dataclass
class DolphinPageResult:
    """Result from page-level parsing of a single page."""
    page_number: int  # 0-based page index
    markdown: str  # Full page rendered as Markdown
    structured_json: Dict[str, Any] = field(default_factory=dict)  # Raw structured output from the model
    elements: List[DolphinElement] = field(default_factory=list)  # Parsed elements in page order
38
+
39
+
40
@dataclass
class DolphinLayoutResult:
    """Result from layout analysis of a single page."""
    page_number: int  # 0-based page index
    sections: List[Dict[str, Any]] = field(default_factory=list)  # Section descriptors: [{type, bbox, label}]
    reading_order: List[int] = field(default_factory=list)  # Element indices in reading order
    doc_type_hint: str = "unknown"  # "digital" or "photographed"
47
+
48
+
49
@dataclass
class DolphinDocumentResult:
    """Aggregated result for an entire PDF document."""
    pages: List[DolphinPageResult] = field(default_factory=list)  # One entry per parsed page
    layouts: List[DolphinLayoutResult] = field(default_factory=list)  # One layout result per page
    full_markdown: str = ""  # All page markdown joined with "---" separators
    total_pages: int = 0  # Number of page images processed
56
+
57
+
58
class DolphinClient:
    """
    High-level client for Dolphin-v2 document parsing.

    Acts as a factory: ``create()`` returns either this local model wrapper
    (when no API URL is configured) or a remote client. The model and
    processor are lazy-loaded on first use so construction stays cheap.
    """

    @staticmethod
    def create():
        """
        Factory method to create the appropriate Dolphin client.

        Returns:
            RemoteDolphinClient if DOLPHIN_API_URL is set,
            a local DolphinClient otherwise.
        """
        from app.core.config import settings

        if settings.DOLPHIN_API_URL:
            from app.services.ingestion.dolphin.remote_client import RemoteDolphinClient
            return RemoteDolphinClient()

        return DolphinClient()

    def __init__(
        self,
        model_path: Optional[str] = None,
        device: Optional[str] = None,
        max_batch_size: int = 4,
    ):
        """
        Args:
            model_path: Override for the model location; defaults to the
                package-configured path.
            device: Override for the torch device; defaults to auto-selection
                (CUDA > MPS > CPU).
            max_batch_size: Batch-size hint; stored but not yet used here.
        """
        from app.services.ingestion.dolphin import _get_model_path, get_device

        self.model_path = model_path or _get_model_path()
        self.device = device or get_device()
        self.max_batch_size = max_batch_size
        self._model = None  # Lazily populated by _ensure_loaded()
        self._processor = None  # Lazily populated by _ensure_loaded()

        logger.info(
            f"DolphinClient initialized: model={self.model_path}, device={self.device}"
        )

    # ------------------------------------------------------------------
    # Lazy model loading
    # ------------------------------------------------------------------

    def _ensure_loaded(self):
        """Lazy-load model and processor on first use.

        Raises:
            RuntimeError: if the model or processor cannot be loaded.
        """
        if self._model is not None:
            return

        try:
            import torch
            from transformers import AutoModelForVision2Seq, AutoProcessor

            logger.info(f"Loading Dolphin-v2 model from {self.model_path}...")

            self._processor = AutoProcessor.from_pretrained(
                self.model_path, trust_remote_code=True
            )
            self._model = AutoModelForVision2Seq.from_pretrained(
                self.model_path,
                trust_remote_code=True,
                # CRITICAL: CPU does not support float16 — force float32 on CPU
                torch_dtype=torch.float32 if self.device == "cpu" else torch.float16,
            )
            self._model.to(self.device)
            self._model.eval()

            logger.info("Dolphin-v2 model loaded successfully")

        except Exception as e:
            logger.error(f"Failed to load Dolphin model: {e}")
            raise RuntimeError(f"Dolphin model loading failed: {e}") from e

    # ------------------------------------------------------------------
    # PDF → Images conversion
    # ------------------------------------------------------------------

    @staticmethod
    def _pdf_to_images(pdf_path: str) -> list:
        """Convert PDF pages to PIL Images for Dolphin processing.

        Prefers pdf2image (poppler); falls back to a pypdf-based
        extractor when pdf2image is not installed.
        """
        try:
            from pdf2image import convert_from_path
            return convert_from_path(pdf_path, dpi=200)
        except ImportError:
            logger.warning("pdf2image not installed, using fallback renderer")
            return DolphinClient._pdf_to_images_fallback(pdf_path)

    @staticmethod
    def _pdf_to_images_fallback(pdf_path: str) -> list:
        """Fallback PDF → image conversion using pypdf.

        Guarantees exactly one image per PDF page so downstream page
        numbering stays aligned: a page's first decodable embedded image
        is used when present, otherwise a blank white placeholder is
        substituted for that page. (Previously placeholders were only
        added when *no* page yielded an image, which could desynchronize
        page indices for mixed documents.)
        """
        from PIL import Image
        import io

        try:
            from pypdf import PdfReader
            reader = PdfReader(pdf_path)
            images = []
            placeholder_count = 0

            for page in reader.pages:
                page_image = None
                for embedded in page.images:
                    try:
                        page_image = Image.open(io.BytesIO(embedded.data))
                        break  # One image per page is enough
                    except Exception:
                        # Corrupt/unsupported embedded image — try the next one
                        continue
                if page_image is None:
                    # Blank placeholder keeps page count and indices consistent
                    page_image = Image.new("RGB", (1700, 2200), "white")
                    placeholder_count += 1
                images.append(page_image)

            if placeholder_count:
                logger.warning(
                    f"{placeholder_count} page(s) had no extractable image; "
                    "layout analysis may be limited"
                )
            return images
        except Exception as e:
            logger.error(f"Fallback PDF image conversion failed: {e}")
            return []

    # ------------------------------------------------------------------
    # Core parsing methods
    # ------------------------------------------------------------------

    def parse_page(self, image, page_number: int = 0) -> "DolphinPageResult":
        """
        Parse a single page image into structured output.

        Args:
            image: PIL Image of the page
            page_number: Page index (0-based)

        Returns:
            DolphinPageResult with markdown and structured elements
            (empty result on parse failure — never raises).
        """
        self._ensure_loaded()

        try:
            import torch

            # Prepare input with the page-level task prompt
            prompt = "<page_parsing>"
            inputs = self._processor(
                images=image, text=prompt, return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self._model.generate(
                    **inputs,
                    max_new_tokens=4096,
                    do_sample=False,  # deterministic decoding
                )

            result_text = self._processor.batch_decode(
                outputs, skip_special_tokens=True
            )[0]

            # Split raw output into text/table elements
            elements = self._parse_elements_from_text(result_text, page_number)

            return DolphinPageResult(
                page_number=page_number,
                markdown=result_text,
                structured_json={"raw_output": result_text},
                elements=elements,
            )

        except Exception as e:
            logger.error(f"Dolphin page parsing failed for page {page_number}: {e}")
            return DolphinPageResult(
                page_number=page_number,
                markdown="",
                elements=[],
            )

    def parse_layout(self, image, page_number: int = 0) -> "DolphinLayoutResult":
        """
        Analyze layout/structure of a page image.

        Returns section bounding boxes, reading order, and document type hint.
        Returns an empty layout result on failure — never raises.
        """
        self._ensure_loaded()

        try:
            import torch

            prompt = "<layout_parsing>"
            inputs = self._processor(
                images=image, text=prompt, return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self._model.generate(
                    **inputs,
                    max_new_tokens=2048,
                    do_sample=False,
                )

            result_text = self._processor.batch_decode(
                outputs, skip_special_tokens=True
            )[0]

            sections = self._parse_layout_sections(result_text)
            doc_type_hint = "digital"  # Dolphin detects this in stage 1

            return DolphinLayoutResult(
                page_number=page_number,
                sections=sections,
                reading_order=list(range(len(sections))),
                doc_type_hint=doc_type_hint,
            )

        except Exception as e:
            logger.error(f"Dolphin layout parsing failed for page {page_number}: {e}")
            return DolphinLayoutResult(page_number=page_number)

    def parse_document(self, pdf_path: str) -> "DolphinDocumentResult":
        """
        Parse an entire PDF document — page-level + layout for all pages.

        This is the main entry point for the hybrid parser.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            DolphinDocumentResult with all pages parsed (total_pages=0 when
            no page images could be extracted).
        """
        images = self._pdf_to_images(pdf_path)
        if not images:
            logger.warning(f"No page images extracted from {pdf_path}")
            return DolphinDocumentResult(total_pages=0)

        pages = []
        layouts = []
        all_markdown = []

        for i, image in enumerate(images):
            logger.debug(f"Parsing page {i + 1}/{len(images)}")

            # Page-level parsing (structured content)
            page_result = self.parse_page(image, page_number=i)
            pages.append(page_result)
            all_markdown.append(page_result.markdown)

            # Layout analysis (structure detection)
            layouts.append(self.parse_layout(image, page_number=i))

        return DolphinDocumentResult(
            pages=pages,
            layouts=layouts,
            full_markdown="\n\n---\n\n".join(all_markdown),
            total_pages=len(images),
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_elements_from_text(text: str, page_number: int) -> List["DolphinElement"]:
        """Parse Dolphin's text output into structured DolphinElement objects.

        Markdown table blocks become "table" elements; everything between
        them becomes "text" elements, preserving document order.
        """
        elements = []
        if not text:
            return elements

        import re

        # Match contiguous runs of Markdown table rows
        table_pattern = re.compile(r"(\|.+\|(?:\n\|.+\|)*)", re.MULTILINE)

        last_end = 0
        for match in table_pattern.finditer(text):
            # Text before table
            pre_text = text[last_end:match.start()].strip()
            if pre_text:
                elements.append(DolphinElement(
                    element_type="text",
                    content=pre_text,
                    page_number=page_number,
                ))

            # Table element
            elements.append(DolphinElement(
                element_type="table",
                content=match.group(0),
                page_number=page_number,
            ))
            last_end = match.end()

        # Remaining text after last table
        remaining = text[last_end:].strip()
        if remaining:
            elements.append(DolphinElement(
                element_type="text",
                content=remaining,
                page_number=page_number,
            ))

        return elements

    @staticmethod
    def _parse_layout_sections(text: str) -> List[Dict[str, Any]]:
        """Parse Dolphin layout output into section descriptors.

        Looks for "<label> [x1, y1, x2, y2]" patterns; when none are
        present, each non-empty line is treated as a bare section label.
        """
        sections = []
        if not text:
            return sections

        import re

        # Pattern: <section_type> [x1, y1, x2, y2]
        bbox_pattern = re.compile(
            r"(\w+[\w\s]*?)\s*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]"
        )

        for match in bbox_pattern.finditer(text):
            sections.append({
                "type": match.group(1).strip(),
                "bbox": [
                    int(match.group(2)),
                    int(match.group(3)),
                    int(match.group(4)),
                    int(match.group(5)),
                ],
            })

        # If no bbox patterns found, treat each line as a section label
        if not sections:
            for line in text.strip().split("\n"):
                line = line.strip()
                if line:
                    sections.append({"type": line, "bbox": []})

        return sections
app/services/ingestion/dolphin/extractor.py ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dolphin Extractor — Extracts structured financial data from Dolphin's parsed output.
3
+
4
+ Converts Dolphin's Markdown/JSON tables and text elements into
5
+ key-value financial data using the existing DataMapper.
6
+ """
7
+
8
+ import re
9
+ import logging
10
+ from typing import Dict, List, Any, Optional
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class DolphinExtractor:
    """
    Extracts financial data from Dolphin's parsed output.

    Works with DolphinPageResult and DolphinElement objects to produce
    a flat dict of {field_name: value} pairs ready for FinancialReport
    construction.

    Usage:
        extractor = DolphinExtractor()
        data = extractor.extract(dolphin_result, doc_classification)
    """

    @staticmethod
    def extract(
        dolphin_result,  # DolphinDocumentResult
        doc_classification=None,  # DocumentClassification
    ) -> Dict[str, Any]:
        """
        Extract all financial data from a Dolphin document result.

        Args:
            dolphin_result: DolphinDocumentResult from client.parse_document()
            doc_classification: Optional classification to guide extraction

        Returns:
            Dict of {standardized_field_name: float_value}
        """
        from app.services.ingestion.mappings import DataMapper

        extracted = {}
        tables_data = []
        text_content_parts = []

        # Separate Dolphin elements into tables and free text
        for page in dolphin_result.pages:
            for element in page.elements:
                if element.element_type == "table":
                    tables_data.append(
                        DolphinExtractor._parse_markdown_table(element.content)
                    )
                elif element.element_type == "text":
                    text_content_parts.append(element.content)

        # --- Strategy 1: Table Extraction (most precise) ---
        for table_rows in tables_data:
            table_data = DolphinExtractor._extract_from_table_rows(
                table_rows, DataMapper
            )
            # First value wins — never overwrite a field already found
            for k, v in table_data.items():
                extracted.setdefault(k, v)

        # --- Strategy 2: Text/Regex Extraction from Dolphin output ---
        full_text = "\n".join(text_content_parts)
        if full_text:
            for k, v in DolphinExtractor._extract_from_text(full_text, DataMapper).items():
                extracted.setdefault(k, v)

        # --- Strategy 3: Full Markdown extraction (catch-all) ---
        if dolphin_result.full_markdown:
            markdown_data = DolphinExtractor._extract_from_text(
                dolphin_result.full_markdown, DataMapper
            )
            for k, v in markdown_data.items():
                extracted.setdefault(k, v)

        logger.info(
            f"Dolphin extracted {len(extracted)} fields from "
            f"{len(tables_data)} tables and {len(text_content_parts)} text blocks"
        )

        return extracted

    @staticmethod
    def extract_company_name(dolphin_result) -> Optional[str]:
        """
        Attempt to extract a company name from Dolphin's parsed output.

        Looks for SEC filing patterns, document headers, and prominent text
        on the first two pages; returns at most 100 characters, or None.
        """
        if not dolphin_result.pages:
            return None

        # Check first page(s) for company name patterns
        for page in dolphin_result.pages[:2]:
            markdown = page.markdown
            if not markdown:
                continue

            # SEC Filing: "Exact name of registrant as specified in its charter"
            registrant_match = re.search(
                r"(?:exact\s+name\s+of\s+registrant|registrant)",
                markdown,
                re.IGNORECASE,
            )
            if registrant_match:
                # The registrant's name usually appears just above the marker
                lines = markdown[: registrant_match.start()].strip().split("\n")
                for line in reversed(lines[-10:]):
                    candidate = line.strip().strip("#").strip("*").strip()
                    if (
                        len(candidate) > 2
                        and not _is_boilerplate(candidate)
                        and any(c.isalpha() for c in candidate)
                    ):
                        return candidate[:100]

            # Markdown heading on first page
            heading_match = re.search(r"^#+\s+(.+)$", markdown, re.MULTILINE)
            if heading_match:
                candidate = heading_match.group(1).strip()
                if len(candidate) > 2 and not _is_boilerplate(candidate):
                    return candidate[:100]

            # Last resort: first non-trivial line near the top of the page
            for line in markdown.split("\n")[:30]:
                candidate = line.strip().strip("#").strip("*").strip()
                if (
                    len(candidate) > 3
                    and not _is_boilerplate(candidate)
                    and any(c.isalpha() for c in candidate)
                ):
                    return candidate[:100]

        return None

    @staticmethod
    def extract_fiscal_year(dolphin_result) -> Optional[str]:
        """Extract a fiscal year/period string from Dolphin output, or None.

        Only the first 5000 characters are scanned — period headers appear
        at the top of financial documents.
        """
        if not dolphin_result.full_markdown:
            return None

        patterns = [
            r"(?:YEAR|PERIOD|FISCAL\s+YEAR)\s+ENDED\s+([A-Z]+\s+\d{1,2},\s+\d{4})",
            r"(?:for\s+the\s+year\s+ended)\s+([A-Z]+\s+\d{1,2},\s+\d{4})",
            r"DECEMBER\s+31,\s+(\d{4})",
            r"(\d{4})\s+(?:annual|fiscal)",
        ]

        text = dolphin_result.full_markdown[:5000]
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)

        return None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_markdown_table(table_text: str) -> List[List[str]]:
        """
        Parse a Markdown-format table into a list of rows.

        Handles:
            | Header1 | Header2 |
            |---------|---------|
            | val1    | val2    |

        Interior empty cells are preserved (as "") so column positions stay
        aligned with the header row; only the empty edge cells produced by
        leading/trailing pipes are dropped. (Previously ALL empty cells were
        removed, which shifted values out of their year columns.)
        """
        rows = []
        for line in table_text.strip().split("\n"):
            line = line.strip()
            if not line.startswith("|"):
                continue
            # Skip separator rows (|---|---|)
            if all(re.match(r"^[\s\-:]+$", c) for c in line.split("|") if c.strip()):
                continue

            cells = [cell.strip() for cell in line.split("|")]
            # Drop only the empty edge cells from leading/trailing pipes
            if cells and cells[0] == "":
                cells = cells[1:]
            if cells and cells[-1] == "":
                cells = cells[:-1]
            if cells:
                rows.append(cells)

        return rows

    @staticmethod
    def _extract_from_table_rows(
        rows: List[List[str]], data_mapper
    ) -> Dict[str, float]:
        """
        Extract financial data from parsed table rows using DataMapper.

        Assumes first column is the label, remaining columns are values.
        Picks the most recent year column if years are detected in headers,
        and applies an "in thousands"/"in millions" scale multiplier when
        the header declares one.
        """
        if not rows:
            return {}

        data = {}

        # Detect target value column (most recent year)
        target_col = _find_target_column(rows)

        # Detect scale multiplier from header text
        multiplier = 1.0
        header_text = " ".join(" ".join(r) for r in rows[:3]).lower()
        if re.search(r"in millions|amounts in millions", header_text):
            multiplier = 1_000_000.0
        elif re.search(r"in thousands|amounts in thousands|\(in 000s\)", header_text):
            multiplier = 1_000.0

        for row in rows:
            if len(row) < 2:
                continue

            label = row[0]
            mapped_field = data_mapper.map_row(label)
            if not mapped_field:
                continue

            # Prefer the target (most recent year) column; otherwise take
            # the first cell that parses as a number.
            val = None
            if target_col is not None and target_col < len(row):
                val = _clean_financial_value(row[target_col])

            if val is None:
                for cell in row[1:]:
                    val = _clean_financial_value(cell)
                    if val is not None:
                        break

            if val is not None:
                data[mapped_field] = val * multiplier

        return data

    @staticmethod
    def _extract_from_text(
        text: str, data_mapper
    ) -> Dict[str, float]:
        """
        Regex-based extraction from unstructured text.

        Catches line items in formats like:
            Revenue ............... $1,234,567
            Net Income (456,789)
        """
        data = {}

        for field, aliases in data_mapper.FIELD_MAPPING.items():
            if field in data:
                continue

            for alias in aliases:
                # Alias followed by optional filler, then a (possibly
                # parenthesized / comma-grouped) number
                pattern = re.compile(
                    rf"{re.escape(alias)}[^0-9\-]*?(\(?[\d,]+\.?\d*\)?)",
                    re.IGNORECASE,
                )
                match = pattern.search(text)
                if match:
                    val = _clean_financial_value(match.group(1))
                    if val is not None:
                        data[field] = val
                        break

        return data
279
+
280
+
281
+ # ---------------------------------------------------------------------------
282
+ # Module-level utility functions
283
+ # ---------------------------------------------------------------------------
284
+
285
+ def _find_target_column(rows: List[List[str]]) -> Optional[int]:
286
+ """Find the column index containing the most recent year."""
287
+ max_year = 0
288
+ target_col = None
289
+
290
+ for row in rows[:5]: # Check headers
291
+ for idx, cell in enumerate(row):
292
+ cell_clean = cell.replace("$", "").strip()
293
+ if re.match(r"^\d{4}$", cell_clean):
294
+ year = int(cell_clean)
295
+ if 2000 < year < 2100 and year > max_year:
296
+ max_year = year
297
+ target_col = idx
298
+
299
+ return target_col
300
+
301
+
302
+ def _clean_financial_value(val_str: Optional[str]) -> Optional[float]:
303
+ """Convert financial string formats to float."""
304
+ if not val_str:
305
+ return None
306
+
307
+ s = val_str.strip().replace("$", "").replace(",", "").replace(" ", "")
308
+ if not s:
309
+ return None
310
+
311
+ # Handle parentheses as negative: (123) → -123
312
+ if "(" in s and ")" in s:
313
+ s = s.replace("(", "-").replace(")", "")
314
+
315
+ # Handle em-dash or dash as zero
316
+ if s in ("-", "—", "–"):
317
+ return 0.0
318
+
319
+ try:
320
+ return float(s)
321
+ except ValueError:
322
+ return None
323
+
324
+
325
+ _BOILERPLATE_PHRASES = {
326
+ "table of contents", "contents", "index", "financial statements",
327
+ "consolidated financial statements", "annual report", "quarterly report",
328
+ "10-k", "10-q", "form 10-k", "form 10-q", "united states",
329
+ "securities and exchange commission", "washington", "d.c.",
330
+ "commission file number", "transition report",
331
+ }
332
+
333
+
334
+ def _is_boilerplate(text: str) -> bool:
335
+ """Check if text is a common boilerplate heading."""
336
+ return text.strip().lower() in _BOILERPLATE_PHRASES or text.strip().isdigit()
app/services/ingestion/dolphin/remote_client.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Remote Dolphin Client — Consumes the Dolphin-as-a-Service API.
3
+
4
+ Sends PDF files to the external AI Worker (Hugging Face Space)
5
+ and receives structured extraction results.
6
+ """
7
+
8
+ import os
9
+ import httpx
10
+ import logging
11
+ from typing import Optional, Dict, Any, List
12
+ from dataclasses import asdict
13
+
14
+ from app.core.config import settings
15
+ from app.services.ingestion.dolphin.client import (
16
+ DolphinDocumentResult,
17
+ DolphinPageResult,
18
+ DolphinLayoutResult,
19
+ DolphinElement,
20
+ )
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class RemoteDolphinClient:
    """
    Client for the remote Dolphin AI worker service.

    Usage:
        client = RemoteDolphinClient(api_url="https://hf.space/...", api_key="...")
        result = client.parse_document("report.pdf")
    """

    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        timeout: int = 300,  # 5 minutes for large PDFs
    ):
        # Fall back to application settings when not passed explicitly.
        resolved_url = api_url or settings.DOLPHIN_API_URL
        self.api_url = resolved_url.rstrip("/")
        self.api_key = api_key or settings.DOLPHIN_API_KEY
        self.timeout = timeout

        if not self.api_url:
            raise ValueError("DOLPHIN_API_URL must be set for RemoteDolphinClient")

        logger.info(f"Initialized RemoteDolphinClient pointing to {self.api_url}")

    def parse_document(self, pdf_path: str) -> DolphinDocumentResult:
        """
        Send PDF to remote worker and reconstruct the result object.
        """
        if not os.path.exists(pdf_path):
            logger.error(f"PDF not found: {pdf_path}")
            return DolphinDocumentResult(total_pages=0)

        endpoint = f"{self.api_url}/process"
        request_headers = {}
        if self.api_key:
            request_headers["Authorization"] = f"Bearer {self.api_key}"

        try:
            logger.info(f"Sending {pdf_path} to remote Dolphin worker...")

            with open(pdf_path, "rb") as pdf_file:
                upload = {"file": (os.path.basename(pdf_path), pdf_file, "application/pdf")}

                with httpx.Client(timeout=self.timeout) as http:
                    response = http.post(endpoint, files=upload, headers=request_headers)
                    response.raise_for_status()

                    payload = response.json()
                    return self._reconstruct_result(payload)

        except httpx.HTTPStatusError as e:
            logger.error(f"Remote Dolphin API error: {e.response.text}")
            raise RuntimeError(f"Dolphin API failed: {e.response.status_code}") from e
        except Exception as e:
            logger.error(f"Remote Dolphin client failed: {e}")
            raise

    def _reconstruct_result(self, data: Dict[str, Any]) -> DolphinDocumentResult:
        """Convert JSON response back to DolphinDocumentResult objects."""
        pages = [
            DolphinPageResult(
                page_number=page["page_number"],
                markdown=page["markdown"],
                structured_json=page.get("structured_json", {}),
                elements=[DolphinElement(**elem) for elem in page.get("elements", [])],
            )
            for page in data.get("pages", [])
        ]

        layouts = [
            DolphinLayoutResult(
                page_number=layout["page_number"],
                sections=layout.get("sections", []),
                reading_order=layout.get("reading_order", []),
                doc_type_hint=layout.get("doc_type_hint", "unknown"),
            )
            for layout in data.get("layouts", [])
        ]

        return DolphinDocumentResult(
            pages=pages,
            layouts=layouts,
            full_markdown=data.get("full_markdown", ""),
            total_pages=data.get("total_pages", 0),
        )
app/services/ingestion/keyword_learner.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Keyword Learner — Self-improving classification from admin-uploaded reference documents.
3
+ ========================================================================================
4
+ Extracts candidate keywords from known-good reference documents, letting admins
5
+ review and approve them to grow the keyword registry over time.
6
+
7
+ Supports batch training with up to 5 files at once.
8
+ """
9
+
10
+ import re
11
+ import logging
12
+ import os
13
+ from collections import Counter
14
+ from typing import List, Dict, Optional, Tuple
15
+
16
+ from .doc_keywords import (
17
+ DOC_TYPE_REGISTRY,
18
+ get_effective_keywords,
19
+ load_learned_keywords,
20
+ save_learned_keywords,
21
+ )
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # ============================================================================
26
+ # CONFIGURATION
27
+ # ============================================================================
28
+ MAX_TRAINING_FILES = 5
29
+ MIN_PHRASE_LENGTH = 3 # Minimum characters for a candidate keyword
30
+ MAX_PHRASE_LENGTH = 80 # Maximum characters for a candidate keyword
31
+ MIN_FREQUENCY = 1 # Minimum appearances across files to be a candidate
32
+ COMMON_STOPWORDS = {
33
+ "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
34
+ "of", "with", "by", "from", "as", "is", "was", "are", "were", "be",
35
+ "been", "being", "have", "has", "had", "do", "does", "did", "will",
36
+ "would", "could", "should", "may", "might", "shall", "can",
37
+ "this", "that", "these", "those", "it", "its", "they", "them",
38
+ "their", "he", "she", "we", "you", "i", "me", "my", "your",
39
+ "page", "date", "total", "amount", "number", "name",
40
+ }
41
+
42
+
43
def extract_candidate_keywords(
    texts: List[str],
    doc_type_key: str,
) -> List[Dict]:
    """
    Extract candidate keywords from reference document texts.

    Analyzes the texts for distinctive phrases that are NOT already
    in the registry for the given doc type.

    Args:
        texts: List of extracted text strings (one per uploaded file)
        doc_type_key: The doc type key (e.g. "rent_roll")

    Returns:
        List of candidate keyword dicts with:
        - keyword: the candidate text
        - frequency: total appearances across all files
        - files_found_in: how many files it appeared in
        - confidence: estimated relevance score (0-1)
        - is_unique_to_type: True if no other doc type uses this keyword
    """
    if doc_type_key not in DOC_TYPE_REGISTRY:
        logger.warning(f"Unknown doc type key: {doc_type_key}")
        return []

    # Get existing keywords for this type (base + learned)
    existing = {kw.lower() for kw in get_effective_keywords(doc_type_key)}

    # Get ALL keywords across ALL types to find what's unique to this type
    all_other_keywords = set()
    for key in DOC_TYPE_REGISTRY:
        if key != doc_type_key:
            for kw in get_effective_keywords(key):
                all_other_keywords.add(kw.lower())

    # Extract phrases from all texts
    phrase_counter = Counter()
    file_presence = Counter()  # Track in how many files each phrase appears

    for text in texts:
        if not text:
            continue
        found_in_this_file = set()
        for phrase in _extract_phrases(text):
            phrase_lower = phrase.lower().strip()
            if (
                MIN_PHRASE_LENGTH <= len(phrase_lower) <= MAX_PHRASE_LENGTH
                and phrase_lower not in existing
                and phrase_lower not in COMMON_STOPWORDS
                and not phrase_lower.isdigit()
            ):
                phrase_counter[phrase_lower] += 1
                if phrase_lower not in found_in_this_file:
                    file_presence[phrase_lower] += 1
                    found_in_this_file.add(phrase_lower)

    # Score candidates
    candidates = []
    num_files = len(texts)
    for phrase, count in phrase_counter.most_common(100):
        files_in = file_presence.get(phrase, 0)

        # BUG FIX: `is_unique` and `uniqueness_bonus` were referenced below
        # but never assigned, raising NameError for any non-empty candidate
        # set. A phrase used by no OTHER doc type is distinctive for this
        # one; the 0.3 bonus matches the original scoring comments
        # (e.g. "0.4 + 0.3 + 0.4 = 1.1 -> 1.0").
        is_unique = phrase not in all_other_keywords
        uniqueness_bonus = 0.3 if is_unique else 0.0

        # Confidence scoring:
        # - Higher if phrase appears in more files (consistent)
        # - Higher if phrase is NOT in other doc types (distinctive)
        # - Lower if it's very generic
        if num_files == 1:
            # Specialized scoring for single-file uploads (demo mode):
            # base confidence for a phrase in 1 file is low, so raw
            # frequency matters much more here.
            base_score = 0.4

            if count >= 5:
                freq_bonus = 0.4
            elif count >= 3:
                freq_bonus = 0.2
            elif count >= 2:
                freq_bonus = 0.1
            else:
                freq_bonus = 0.0

            confidence = base_score + uniqueness_bonus + freq_bonus
        else:
            # Standard multi-file scoring: consistency across files
            # matters most.
            file_ratio = files_in / max(num_files, 1)
            confidence = (file_ratio * 0.5) + uniqueness_bonus + (0.2 if count > 2 else 0.0)

        confidence = min(confidence, 1.0)

        if confidence >= 0.2:  # Only suggest if minimally confident
            candidates.append({
                "keyword": phrase,
                "frequency": count,
                "files_found_in": files_in,
                "confidence": round(confidence, 2),
                "is_unique_to_type": is_unique,
            })

    # Sort by confidence desc, then frequency desc
    candidates.sort(key=lambda x: (-x["confidence"], -x["frequency"]))

    return candidates[:50]  # Return top 50 candidates
146
+
147
+
148
def approve_keywords(
    doc_type_key: str,
    keywords: List[str],
) -> Dict:
    """
    Approve candidate keywords and persist them to the learned registry.

    Args:
        doc_type_key: The doc type key
        keywords: List of keyword strings to approve

    Returns:
        Dict with status and counts
    """
    if doc_type_key not in DOC_TYPE_REGISTRY:
        return {"error": f"Unknown doc type: {doc_type_key}", "added": 0}

    learned = load_learned_keywords()
    bucket = learned.setdefault(doc_type_key, [])

    added = 0
    for raw_kw in keywords:
        candidate = raw_kw.strip()
        # Skip blanks and keywords that were already learned.
        if candidate and candidate not in bucket:
            bucket.append(candidate)
            added += 1

    save_learned_keywords(learned)

    total_keywords = len(get_effective_keywords(doc_type_key))

    logger.info(
        f"Approved {added} new keywords for '{doc_type_key}'. "
        f"Total effective keywords: {total_keywords}"
    )

    return {
        "doc_type": doc_type_key,
        "added": added,
        "total_learned": len(learned.get(doc_type_key, [])),
        "total_effective": total_keywords,
    }
192
+
193
+
194
def get_training_stats() -> Dict:
    """
    Get training statistics for the admin dashboard.

    Returns:
        Dict with per-type learned keyword counts and totals.
    """
    learned = load_learned_keywords()

    per_type: Dict = {}
    for key, doc_type in DOC_TYPE_REGISTRY.items():
        learned_count = len(learned.get(key, []))
        if learned_count > 0:
            base_count = len(doc_type.keywords)
            per_type[key] = {
                "display_name": doc_type.display_name,
                "base_keywords": base_count,
                "learned_keywords": learned_count,
                "total_keywords": base_count + learned_count,
            }

    return {
        "total_learned_keywords": sum(len(v) for v in learned.values()),
        "types_with_learned": len(learned),
        "per_type": per_type,
    }
219
+
220
+
221
+ # ============================================================================
222
+ # INTERNAL HELPERS
223
+ # ============================================================================
224
+
225
def _extract_phrases(text: str) -> List[str]:
    """
    Extract meaningful phrases from document text.

    Looks for:
    - Multi-word capitalized headers/labels (e.g., "TOTAL NET REVENUE")
    - Key-value labels before colons (e.g., "Policy Number:")
    - Table header-like strings
    """
    collected: List[str] = []

    # 1. All-caps multi-word phrases (headers, labels)
    for match in re.finditer(r'\b([A-Z][A-Z\s&/\-\']{2,}[A-Z])\b', text):
        caps_phrase = match.group(1).strip()
        if len(caps_phrase) >= MIN_PHRASE_LENGTH:
            collected.append(caps_phrase)

    # 2. Labels before colons
    for match in re.finditer(r'([A-Za-z][\w\s&/\-]{2,})\s*:', text):
        label = match.group(1).strip()
        if len(label) >= MIN_PHRASE_LENGTH:
            collected.append(label)

    # 3. Lines that look like section headers (Title Case at start of line)
    connector_words = ('and', 'or', 'of', 'the', 'in', 'for', 'to', 'by', '&', '/', '-')
    for raw_line in text.split('\n'):
        candidate = raw_line.strip()
        if not candidate or len(candidate) > MAX_PHRASE_LENGTH:
            continue
        words = candidate.split()
        if len(words) < 2:
            continue
        # Each word is capitalized (or an allowed lowercase connector).
        if all(w[0].isupper() or w in connector_words for w in words if w):
            collected.append(candidate)

    return collected
app/services/ingestion/learned_keywords.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
app/services/ingestion/mappings.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Mapper - Field name normalization for financial data.
3
+
4
+ Maps various field names from different file formats (CSV, PDF, XLSX)
5
+ to standardized internal field names.
6
+ """
7
+
8
+ from typing import Dict, List, Optional, Tuple
9
+
10
+
11
class DataMapper:
    """
    Maps raw field names to standardized internal field names.

    Matching strategy (shared by map_row / map_row_with_confidence):
    1. Exact match against a canonical field name (or a known alias).
    2. Longest-alias substring match, subject to per-field EXCLUSIONS
       (e.g. "cost of revenue" must never map to "revenue").

    Usage:
        field = DataMapper.map_row("Total Revenue")  # Returns "revenue"
        field = DataMapper.map_row("Accounts Receivable")  # Returns "accounts_receivable"
    """

    FIELD_MAPPING: Dict[str, List[str]] = {
        # =================================================================
        # INCOME STATEMENT
        # =================================================================
        "revenue": [
            "revenue", "sales", "gross sales", "total revenue", "net sales",
            "total net sales", "net revenue", "total sales", "service revenue",
            "product revenue", "subscription revenue", "recurring revenue",
            "operating revenue", "revenues, net", "revenues"
        ],
        "cogs": [
            "cogs", "cost of goods", "direct costs", "cost of sales",
            "cost of revenue", "cost of goods sold", "cost of products sold",
            "cost of services", "direct cost", "cost of merchandise"
        ],
        "marketing_expenses": [
            "marketing", "ad spend", "advertising", "marketing expense",
            "promotion", "marketing and advertising", "advertising expense",
            "marketing costs", "promotional expense", "customer acquisition"
        ],
        "payroll_expenses": [
            "payroll", "salaries", "wages", "employee costs", "personnel",
            "labor", "compensation", "salaries and wages", "employee benefits",
            "stock compensation", "share-based compensation", "labor cost",
            "wages and salaries", "staff costs"
        ],
        "rent_expense": [
            "rent", "lease", "occupancy", "facilities", "rent expense",
            "lease expense", "occupancy costs", "facility costs"
        ],
        "other_operating_expenses": [
            "other expense", "operating expense", "sga", "general and administrative",
            "g&a", "selling, general", "pre-opening", "impairment",
            "administrative expense", "operating expenses", "other operating",
            "research and development", "r&d", "utilities", "insurance"
        ],
        "depreciation": [
            "depreciation", "depreciation expense", "depreciation and amortization"
        ],
        "amortization": [
            "amortization", "amortization expense"
        ],
        "interest_expense": [
            "interest", "interest expense", "finance costs", "interest cost",
            "interest and finance charges", "borrowing costs"
        ],
        "taxes": [
            "tax", "income tax", "taxes", "provision for taxes", "income tax expense",
            "tax expense", "provision for income taxes"
        ],

        # =================================================================
        # BALANCE SHEET - ASSETS
        # =================================================================
        "cash": [
            "cash", "bank", "cash and equivalents", "cash & equivalents",
            "cash and cash equivalents", "cash on hand", "short-term investments",
            "cash, cash equivalents"
        ],
        "accounts_receivable": [
            "accounts receivable", "ar", "receivables", "trade receivables",
            "net receivables", "receivables, net", "trade accounts receivable"
        ],
        "inventory": [
            "inventory", "stock", "merchandise", "inventories",
            "merchandise inventory", "raw materials"
        ],
        "prepaid_expenses": [
            "prepaid", "prepaid expenses", "other current assets",
            "prepaid and other", "prepaids"
        ],
        "property_plant_equipment": [
            "ppe", "fixed assets", "property plant equipment", "equipment",
            "property, plant and equipment", "property and equipment",
            "net property", "fixed assets, net", "capital assets"
        ],
        "accumulated_depreciation": [
            "accumulated depreciation", "acc depreciation", "less depreciation"
        ],
        "intangible_assets": [
            "intangible assets", "goodwill", "soft assets", "intangibles",
            "goodwill and intangibles"
        ],

        # =================================================================
        # BALANCE SHEET - LIABILITIES
        # =================================================================
        "accounts_payable": [
            "accounts payable", "ap", "payables", "trade payables",
            "trade accounts payable"
        ],
        "accrued_liabilities": [
            "accrued liabilities", "accrued expenses", "accruals",
            "accrued and other"
        ],
        "short_term_debt": [
            "short term debt", "current portion of debt", "notes payable",
            "current debt", "short-term borrowings", "current portion of long-term debt"
        ],
        "long_term_debt": [
            "long term debt", "term loan", "non-current liabilities",
            "long-term borrowings", "bonds payable", "notes payable long-term"
        ],
        "deferred_revenue": [
            "deferred revenue", "unearned revenue", "contract liabilities",
            "deferred income"
        ],
        "total_equity": [
            "equity", "retained earnings", "shareholders equity", "total equity",
            "stockholders equity", "shareholders' equity", "stockholders' equity",
            "total shareholders equity", "net worth", "owner equity"
        ],

        # =================================================================
        # CASH FLOW STATEMENT
        # =================================================================
        "operating_cash_flow": [
            "operating cash flow", "cfo", "cash from operations",
            "cash flow from operating activities", "net cash from operating",
            "cash generated by operating activities", "operating activities",
            "net cash provided by operating", "cash flows from operating"
        ],
        "capex": [
            "capex", "capital expenditure", "purchase of property",
            "additions to property", "capital expenditures",
            "purchases of property", "property additions"
        ],
        "investing_cash_flow": [
            "investing cash flow", "cash from investing",
            "cash flow from investing activities", "investing activities",
            "net cash from investing", "cash flows from investing"
        ],
        "financing_cash_flow": [
            "financing cash flow", "cash from financing",
            "cash flow from financing activities", "financing activities",
            "net cash from financing", "cash flows from financing"
        ],

        # =================================================================
        # OPERATING METRICS
        # =================================================================
        "new_customers": ["new customers", "customer additions", "new users"],
        "total_transactions": ["transactions", "orders", "total orders"],
        "total_seats": ["seats", "licenses", "subscriptions"],
        "active_members": ["members", "active count", "active users"],
        "restaurant_margin": ["restaurant margin", "store margin"],
        "effective_tax_rate": ["effective tax rate", "tax rate"],
        "churn_rate": ["churn", "churn rate", "attrition", "cancellation rate"],
        "cac": ["cac", "acquisition cost", "customer acquisition cost"],
        "ltv": ["ltv", "lifetime value", "cltv", "customer lifetime value"],

        # =================================================================
        # DERIVED / SUMMARY ITEMS (often in Excel templates)
        # =================================================================
        "gross_profit": [
            "gross profit", "gross margin", "gross income"
        ],
        "operating_income": [
            "operating income", "operating profit", "ebit", "income from operations"
        ],
        "net_income": [
            "net income", "net profit", "net earnings", "net income attributable"
        ],
        "ebitda": [
            "ebitda", "earnings before interest"
        ],
        "total_assets": [
            "total assets", "assets total"
        ],
        "total_liabilities": [
            "total liabilities", "liabilities total"
        ],

        # =================================================================
        # SPECIALIZED DOCUMENT TYPE FIELDS
        # =================================================================

        # ARR / MRR (SaaS)
        "beginning_arr": [
            "beginning arr", "opening arr", "start arr", "bop arr"
        ],
        "new_logo_arr": [
            "new logo arr", "new logos", "new business arr", "new customer arr"
        ],
        "expansion_arr": [
            "expansion arr", "upsell", "cross-sell", "expansion revenue"
        ],
        "contraction_arr": [
            "contraction arr", "downgrade", "contraction", "downsell"
        ],
        "churn_arr": [
            "churn arr", "churned arr", "lost arr", "cancellation arr"
        ],
        "ending_arr": [
            "ending arr", "closing arr", "exit arr", "eop arr"
        ],

        # Deferred Revenue / ASC 606
        "beginning_balance": [
            "beginning balance", "opening balance", "beginning deferred revenue"
        ],
        "billings": [
            "billings", "invoiced", "new contracts billed", "fees billed"
        ],
        "revenue_recognized": [
            "revenue recognized", "earned revenue", "satisfaction of performance obligation"
        ],
        "ending_balance": [
            "ending balance", "closing balance", "ending deferred revenue"
        ],

        # Real Estate / NOI
        "rental_revenue": [
            "rental revenue", "rental income", "gross potential rent"
        ],
        "vacancy_loss": [
            "vacancy loss", "vacancy", "credit loss", "vacancy & credit loss"
        ],
        "noi": [
            "net operating income", "noi", "income before debt service"
        ],
        "management_fees": [
            "management fees", "property management", "management expense"
        ],

        # Manufacturing / COGM
        "direct_materials": [
            "direct materials", "raw materials consumed", "material costs"
        ],
        "direct_labor": [
            "direct labor", "manufacturing labor", "touch labor"
        ],
        "factory_overhead": [
            "factory overhead", "manufacturing overhead", "indirect costs", "burden"
        ],

        # Energy / Oil & Gas
        "proved_reserves": [
            "proved reserves", "1p reserves", "total proved"
        ],
        "pv10_value": [
            "pv-10", "present value at 10%", "discounted future net cash flows"
        ],
        "working_interest": [
            "working interest", "wi %", "decimal interest"
        ],
    }

    # Exclusion rules: (field, [terms that should NOT trigger this field])
    EXCLUSIONS: Dict[str, List[str]] = {
        "revenue": ["cost", "marketable securities", "deferred"],
        "total_equity": ["awards", "liability", "liabilities", "debt"],
        "cash": ["non-cash", "noncash"],
        "depreciation": ["accum", "accumulated"],
    }

    # Field categories for validation
    INCOME_FIELDS = [
        "revenue", "cogs", "marketing_expenses", "payroll_expenses", "rent_expense",
        "other_operating_expenses", "depreciation", "amortization", "interest_expense", "taxes",
        "gross_profit", "operating_income", "net_income", "ebitda"
    ]

    BALANCE_FIELDS = [
        "cash", "accounts_receivable", "inventory", "prepaid_expenses",
        "property_plant_equipment", "accumulated_depreciation", "intangible_assets",
        "accounts_payable", "accrued_liabilities", "short_term_debt", "long_term_debt",
        "deferred_revenue", "total_equity", "total_assets", "total_liabilities"
    ]

    CASH_FIELDS = [
        "operating_cash_flow", "capex", "investing_cash_flow", "financing_cash_flow"
    ]

    @staticmethod
    def _normalize_label(row_label: str) -> str:
        """Lowercase, strip, and de-underscore a raw label for matching."""
        return str(row_label).lower().strip().replace("_", " ")

    @staticmethod
    def _best_substring_match(label_clean: str) -> Tuple[Optional[str], int]:
        """
        Find the longest alias contained in *label_clean*.

        Shared core of map_row and map_row_with_confidence (previously
        duplicated in both). Applies EXCLUSIONS so that e.g. a label
        containing "cost" can never map to "revenue".

        Returns:
            (field_name, matched_alias_length), or (None, 0) if no match.
        """
        best_field: Optional[str] = None
        best_len = 0

        for field, aliases in DataMapper.FIELD_MAPPING.items():
            # Exclusions depend only on the label, so check once per field.
            excluded_terms = DataMapper.EXCLUSIONS.get(field, ())
            if any(term in label_clean for term in excluded_terms):
                continue

            for alias in aliases:
                # Longest alias match wins (more specific).
                if len(alias) > best_len and alias in label_clean:
                    best_len = len(alias)
                    best_field = field

        return best_field, best_len

    @staticmethod
    def map_row(row_label: str) -> Optional[str]:
        """
        Map a raw field label to a standardized field name.

        Args:
            row_label: The raw label from the source file

        Returns:
            Standardized field name, or None if no match found
        """
        if not row_label:
            return None

        label_clean = DataMapper._normalize_label(row_label)

        # Exact canonical-name match wins immediately (O(1) dict lookup
        # instead of the previous full scan over FIELD_MAPPING).
        if label_clean in DataMapper.FIELD_MAPPING:
            return label_clean

        field, _ = DataMapper._best_substring_match(label_clean)
        return field

    @staticmethod
    def map_row_with_confidence(row_label: str) -> Tuple[Optional[str], float]:
        """
        Map a row label and return confidence score.

        Returns:
            Tuple of (field_name, confidence) where confidence is 0.0-1.0
        """
        if not row_label:
            return None, 0.0

        label_clean = DataMapper._normalize_label(row_label)

        # Exact match (canonical name or alias) = 1.0 confidence
        if label_clean in DataMapper.FIELD_MAPPING:
            return label_clean, 1.0
        for field, aliases in DataMapper.FIELD_MAPPING.items():
            if label_clean in aliases:
                return field, 1.0

        # Partial match = proportional confidence, capped below exact.
        field, match_len = DataMapper._best_substring_match(label_clean)
        if field is None:
            return None, 0.0

        confidence = match_len / len(label_clean)
        return field, min(confidence, 0.95)  # Cap at 0.95 for non-exact

    @staticmethod
    def get_statement_type(field: str) -> Optional[str]:
        """
        Determine which financial statement a field belongs to.

        Returns:
            "income", "balance", "cash_flow", or None
        """
        if field in DataMapper.INCOME_FIELDS:
            return "income"
        elif field in DataMapper.BALANCE_FIELDS:
            return "balance"
        elif field in DataMapper.CASH_FIELDS:
            return "cash_flow"
        return None
app/services/ingestion/parser_csv.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ from typing import Dict, Any, Optional
4
+ from app.schemas.financial import (
5
+ FinancialReport,
6
+ BalanceSheetStandard,
7
+ IncomeStatementStandard,
8
+ CashFlowStandard,
9
+ OperatingMetrics,
10
+ PeriodType,
11
+ Currency
12
+ )
13
+ from datetime import date
14
+
15
+ from app.services.ingestion.mappings import DataMapper
16
+
17
class CSVParser:
    """
    Parse a financial CSV into a FinancialReport.

    Supports two layouts:
    - Horizontal: columns are field names, rows are periods (last row is
      taken as the most recent period).
    - Vertical: column 0 is the label, column 1 is the current value.
    """

    @staticmethod
    def _to_float(val_raw) -> float:
        """
        Coerce a raw cell value to a float with accounting-style cleanup.

        Handles "$1,234", parenthesized negatives "(56)" -> -56.0, and
        dash / em-dash (or blank) as zero. Unparseable strings and
        null cells yield 0.0.

        Previously this cleanup was duplicated verbatim in both layout
        branches, with a bare `except:` that hid real errors.
        """
        if isinstance(val_raw, str):
            s = val_raw.strip().replace("$", "").replace(",", "").replace(" ", "")
            # Handle (123) as negative (accounting convention)
            if "(" in s and ")" in s:
                s = s.replace("(", "-").replace(")", "")
            # Handle dash/em-dash as zero
            if s in ("-", "—", ""):
                return 0.0
            try:
                return float(s)
            except ValueError:  # was a bare `except:` — only parse errors expected
                return 0.0
        return float(val_raw) if pd.notnull(val_raw) else 0.0

    @staticmethod
    def parse(file_path: str) -> FinancialReport:
        """
        Parse the CSV at *file_path* and return a populated FinancialReport.

        Missing fields default to 0.0 (or None for integer metrics).
        """
        df = pd.read_csv(file_path)

        data_dict: Dict[str, float] = {}

        # Detect horizontal format: require at least 3 columns that map
        # to known fields before trusting the column headers.
        matches = sum(1 for col in df.columns if DataMapper.map_row(str(col)))

        if matches >= 3:
            # Horizontal format: take the last row (most recent data).
            last_row = df.iloc[-1]
            for col in df.columns:
                field = DataMapper.map_row(str(col))
                if field:
                    data_dict[field] = CSVParser._to_float(last_row[col])

        elif len(df.columns) >= 2:
            # Vertical (key-value) format: col 0 is label, col 1 is the
            # current period value. Use .iloc for positional access
            # (plain row[0] on a Series is deprecated label-based access).
            for _, row in df.iterrows():
                field = DataMapper.map_row(str(row.iloc[0]))
                if field:
                    data_dict[field] = CSVParser._to_float(row.iloc[1])

        def get(key, default=0.0):
            return data_dict.get(key, default)

        def get_int(key):
            # Integer metrics: None when absent or zero (matches the
            # original `int(get(k)) if get(k) else None` semantics
            # without calling get() twice).
            value = get(key)
            return int(value) if value else None

        income = IncomeStatementStandard(
            revenue=get("revenue"),
            cogs=get("cogs"),
            marketing_expenses=get("marketing_expenses"),
            payroll_expenses=get("payroll_expenses"),
            rent_expense=get("rent_expense"),
            other_operating_expenses=get("other_operating_expenses"),
            depreciation=get("depreciation"),
            amortization=get("amortization"),
            interest_expense=get("interest_expense"),
            taxes=get("taxes")
        )

        balance = BalanceSheetStandard(
            cash=get("cash"),
            accounts_receivable=get("accounts_receivable"),
            inventory=get("inventory"),
            prepaid_expenses=get("prepaid_expenses"),
            property_plant_equipment=get("property_plant_equipment"),
            accumulated_depreciation=get("accumulated_depreciation"),
            intangible_assets=get("intangible_assets"),
            accounts_payable=get("accounts_payable"),
            accrued_liabilities=get("accrued_liabilities"),
            short_term_debt=get("short_term_debt"),
            long_term_debt=get("long_term_debt"),
            deferred_revenue=get("deferred_revenue"),
            total_equity=get("total_equity")
        )

        cash_flow = CashFlowStandard(
            operating_cash_flow=get("operating_cash_flow"),
            capex=get("capex"),
            investing_cash_flow=get("investing_cash_flow"),
            financing_cash_flow=get("financing_cash_flow")
        )

        metrics = OperatingMetrics(
            industry='general',  # Default, could extract from metadata
            new_customers=get_int("new_customers"),
            total_transactions=get_int("total_transactions"),
            total_seats=get_int("total_seats")
        )

        return FinancialReport(
            company_name="Imported Company",
            period_end=date.today(),
            period_type=PeriodType.ANNUAL,
            currency=Currency.USD,
            income_statement=income,
            balance_sheet=balance,
            cash_flow=cash_flow,
            metrics=metrics
        )
app/services/ingestion/parser_dolphin.py ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hybrid PDF Parser — Engines-First Classification Pipeline.
3
+
4
+ Both engines extract text simultaneously, then the combined text
5
+ is classified against 53 document types using keyword scoring.
6
+
7
+ Stage 1: Both engines extract text (Dolphin + pdfplumber, parallel)
8
+ Stage 2: Combined text → keyword scan against 53 doc types (80% threshold)
9
+ Stage 3: Doc-type-aware targeted extraction (using field list)
10
+ Stage 4: Merge extractions (Dolphin priority, pdfplumber gap-fill)
11
+ Stage 5: Standardize & build FinancialReport
12
+ """
13
+
14
+ import logging
15
+ import re
16
+ from typing import Dict, Any, Optional, List
17
+ from datetime import date
18
+
19
+ from app.schemas.financial import (
20
+ FinancialReport,
21
+ BalanceSheetStandard,
22
+ IncomeStatementStandard,
23
+ CashFlowStandard,
24
+ OperatingMetrics,
25
+ PeriodType,
26
+ Currency,
27
+ )
28
+ from app.services.ingestion.mappings import DataMapper
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
class HybridPDFParser:
    """
    Hybrid parser that combines Dolphin-v2 deep parsing with pdfplumber
    gap-filling on every PDF for maximum extraction coverage.

    Implements the same `parse(file_path) -> FinancialReport` interface
    as the original PDFParser.
    """

    @staticmethod
    def parse(file_path: str) -> FinancialReport:
        """
        Parse a PDF using the engines-first classification pipeline.

        Stages:
        1. Both engines extract text simultaneously
        2. Combined text -> classify against 53 doc types (80% threshold)
        3. Type-aware targeted extraction
        4. Merge extractions (Dolphin priority, pdfplumber gap-fill)
        5. Build FinancialReport with classification metadata

        Falls back to pdfplumber-only if Dolphin is unavailable.
        """
        dolphin_data = {}
        pdfplumber_data = {}
        classification = None
        dolphin_company_name = None
        dolphin_fiscal_year = None
        dolphin_text = ""
        doc_result = None
        extraction_method = "pdfplumber"

        # =================================================================
        # Stage 1: Both engines extract text simultaneously
        # =================================================================
        logger.info("Stage 1: Extracting text from both engines")

        # 1a. Dolphin text extraction (best-effort: any failure degrades
        # gracefully to pdfplumber-only mode).
        try:
            from app.services.ingestion.dolphin import is_dolphin_available

            if is_dolphin_available():
                from app.services.ingestion.dolphin.client import DolphinClient
                from app.services.ingestion.dolphin.extractor import DolphinExtractor

                client = DolphinClient.create()
                doc_result = client.parse_document(file_path)
                dolphin_text = doc_result.full_markdown if doc_result.total_pages > 0 else ""
                dolphin_company_name = DolphinExtractor.extract_company_name(doc_result) if doc_result.total_pages > 0 else None
                dolphin_fiscal_year = DolphinExtractor.extract_fiscal_year(doc_result) if doc_result.total_pages > 0 else None
                extraction_method = "dolphin_hybrid"
                logger.info(f"Dolphin extracted {len(dolphin_text)} chars from {doc_result.total_pages} pages")
            else:
                logger.info("Dolphin not available — pdfplumber-only mode")
        except Exception as e:
            logger.warning(f"Dolphin text extraction failed: {e}")

        # 1b. pdfplumber text extraction
        pdfplumber_text = ""
        try:
            import pdfplumber
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        pdfplumber_text += page_text + "\n"
            logger.info(f"pdfplumber extracted {len(pdfplumber_text)} chars")
        except Exception as e:
            logger.warning(f"pdfplumber text extraction failed: {e}")

        # =================================================================
        # Stage 2: Combined text → classify against 53 doc types
        # =================================================================
        combined_text = f"{dolphin_text}\n{pdfplumber_text}"
        logger.info(f"Stage 2: Classifying {len(combined_text)} chars against 53 doc types")

        from app.services.ingestion.dolphin.classifier import DocumentClassifier

        # Collect Dolphin layout sections if available — they sharpen
        # the classifier's scoring.
        all_sections = []
        if doc_result and doc_result.total_pages > 0:
            for layout in doc_result.layouts:
                all_sections.extend(layout.sections)

        classification = DocumentClassifier.classify(
            text_content=combined_text,
            dolphin_sections=all_sections,
        )

        logger.info(
            f"Classified as '{classification.doc_type}' "
            f"({classification.doc_type_display}) — "
            f"{classification.match_percentage}% match, "
            f"confidence={classification.confidence:.2f}, "
            f"needs_review={classification.needs_review}"
        )

        # =================================================================
        # Stage 3: Type-aware targeted extraction
        # =================================================================
        logger.info(f"Stage 3: Extracting data (type-aware: {classification.doc_type})")

        # 3a. Dolphin structured extraction
        if doc_result and doc_result.total_pages > 0:
            try:
                from app.services.ingestion.dolphin.extractor import DolphinExtractor
                dolphin_data = DolphinExtractor.extract(doc_result, classification)
            except Exception as e:
                logger.warning(f"Dolphin extraction failed: {e}")

        # 3b. pdfplumber targeted extraction
        pdfplumber_data = HybridPDFParser._run_pdfplumber_extraction(
            file_path, classification
        )

        # =================================================================
        # Stage 4: Merge — Dolphin priority, pdfplumber gap-fill
        # =================================================================
        merged_data = HybridPDFParser._merge_extractions(dolphin_data, pdfplumber_data)

        logger.info(
            f"Stage 4: Merged {len(dolphin_data)} Dolphin + "
            f"{len(pdfplumber_data)} pdfplumber → "
            f"{len(merged_data)} total fields"
        )

        # =================================================================
        # Stage 5: Build FinancialReport
        # =================================================================
        return HybridPDFParser._build_report(
            extracted_data=merged_data,
            text_content=combined_text,
            file_path=file_path,
            extraction_method=extraction_method,
            classification=classification,
            dolphin_company_name=dolphin_company_name,
            dolphin_fiscal_year=dolphin_fiscal_year,
        )

    # ==================================================================
    # Stage Implementations
    # ==================================================================

    @staticmethod
    def _run_pdfplumber_extraction(file_path: str, classification=None) -> Dict[str, Any]:
        """
        pdfplumber targeted extraction — tables + regex.

        Uses classification to guide which statement types to look for.
        Reuses the proven logic from the existing PDFParser.
        Returns an empty dict on any pdfplumber failure (best-effort).
        """
        from app.services.ingestion.parser_pdf import PDFParser
        from app.services.ingestion.dolphin.classifier import DocumentClassifier
        import pdfplumber

        extracted_data = {}

        try:
            with pdfplumber.open(file_path) as pdf:
                # Determine which statement types to extract based on classification
                if classification:
                    target_types = DocumentClassifier.get_financial_statement_types(classification)
                else:
                    target_types = ["income", "balance", "cash_flow"]

                # Statement page locator
                statement_pages = PDFParser._find_statement_pages(pdf)

                # Extract from identified statement pages
                for stmt_type, page in statement_pages.items():
                    if stmt_type not in target_types:
                        continue  # Skip statement types not relevant to this doc

                    # Restrict table mapping to fields belonging to this
                    # statement, so look-alike labels don't cross-pollute.
                    allowed_fields = None
                    if stmt_type == "income":
                        allowed_fields = DataMapper.INCOME_FIELDS
                    elif stmt_type == "balance":
                        allowed_fields = DataMapper.BALANCE_FIELDS
                    elif stmt_type == "cash_flow":
                        allowed_fields = DataMapper.CASH_FIELDS

                    table_data = PDFParser._extract_table_data(page, allowed_fields)
                    extracted_data.update(table_data)

                # Full text for regex fallback
                text_content = ""
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text_content += page_text + "\n"

                # Regex fallback for fields the tables did not yield
                regex_data = PDFParser._extract_via_regex(
                    text_content, existing_keys=extracted_data.keys()
                )
                extracted_data.update(regex_data)

        except Exception as e:
            logger.warning(f"pdfplumber extraction failed: {e}")

        return extracted_data

    @staticmethod
    def _merge_extractions(
        dolphin_data: Dict[str, Any],
        pdfplumber_data: Dict[str, Any],
    ) -> Dict[str, Any]:
        """
        Merge Dolphin and pdfplumber extractions.

        Priority: Dolphin fields take precedence.
        pdfplumber fills any gaps not covered by Dolphin. A Dolphin value
        of None counts as a gap, and a Dolphin 0.0 is replaced only by a
        real (non-None, non-zero) pdfplumber value.
        """
        merged = dict(dolphin_data)  # Start with Dolphin data

        for key, value in pdfplumber_data.items():
            if key not in merged or merged[key] is None:
                # Gap-fill: Dolphin never produced this field.
                merged[key] = value
            elif merged[key] == 0.0 and value not in (None, 0.0):
                # Dolphin reported 0 but pdfplumber found a real value.
                merged[key] = value

        return merged

    # ==================================================================
    # Report Construction (mirrors PDFParser logic)
    # ==================================================================

    @staticmethod
    def _build_report(
        extracted_data: Dict,
        text_content: str,
        file_path: str,
        extraction_method: str,
        classification=None,
        dolphin_company_name: Optional[str] = None,
        dolphin_fiscal_year: Optional[str] = None,
    ) -> FinancialReport:
        """Build a FinancialReport from merged extracted data.

        Derived totals (gross profit, EBITDA, balance-sheet totals) are
        recomputed from the extracted components rather than trusted from
        the document.
        """

        def get(key, default=0.0):
            # Missing keys AND explicit None both fall back to default.
            val = extracted_data.get(key)
            return val if val is not None else default

        # --- Income Statement ---
        revenue = get("revenue")
        cogs = get("cogs")
        marketing = get("marketing_expenses")
        payroll = get("payroll_expenses")
        rent = get("rent_expense")
        other = get("other_operating_expenses")
        depreciation = get("depreciation")
        amortization = get("amortization")
        interest = get("interest_expense")
        taxes = get("taxes")

        op_expenses = marketing + payroll + rent + other
        gross_profit = revenue - cogs
        ebitda = gross_profit - op_expenses
        op_income = ebitda - depreciation - amortization
        net_income = op_income - interest - taxes

        income = IncomeStatementStandard(
            revenue=revenue, cogs=cogs,
            marketing_expenses=marketing, payroll_expenses=payroll,
            rent_expense=rent, other_operating_expenses=other,
            depreciation=depreciation, amortization=amortization,
            interest_expense=interest, taxes=taxes,
            operating_expenses=op_expenses, gross_profit=gross_profit,
            ebitda=ebitda, operating_income=op_income, net_income=net_income,
        )

        # --- Balance Sheet ---
        cash = get("cash")
        ar = get("accounts_receivable")
        inv = get("inventory")
        prepaid = get("prepaid_expenses")
        ppe = get("property_plant_equipment")
        accum_dep = get("accumulated_depreciation")
        intangibles = get("intangible_assets")
        ap = get("accounts_payable")
        accrued = get("accrued_liabilities")
        st_debt = get("short_term_debt")
        lt_debt = get("long_term_debt")
        deferred = get("deferred_revenue")
        equity = get("total_equity")

        bs_current_assets = cash + ar + inv + prepaid
        bs_total_assets = bs_current_assets + (ppe - accum_dep) + intangibles
        bs_current_liab = ap + accrued + st_debt
        bs_total_liab = bs_current_liab + lt_debt + deferred

        balance = BalanceSheetStandard(
            cash=cash, accounts_receivable=ar, inventory=inv,
            prepaid_expenses=prepaid, property_plant_equipment=ppe,
            accumulated_depreciation=accum_dep, intangible_assets=intangibles,
            accounts_payable=ap, accrued_liabilities=accrued,
            short_term_debt=st_debt, long_term_debt=lt_debt,
            deferred_revenue=deferred, total_equity=equity,
            total_current_assets=bs_current_assets, total_assets=bs_total_assets,
            total_current_liabilities=bs_current_liab, total_liabilities=bs_total_liab,
        )

        # --- Cash Flow ---
        cash_flow = CashFlowStandard(
            operating_cash_flow=get("operating_cash_flow"),
            capex=get("capex"),
            investing_cash_flow=get("investing_cash_flow"),
            financing_cash_flow=get("financing_cash_flow"),
            net_change_in_cash=get("net_change_in_cash"),
        )

        # --- Operating Metrics ---
        # Zero/absent metric values are normalized to None (not reported).
        metrics = OperatingMetrics(
            industry="restaurant" if get("restaurant_margin") else "general",
            new_customers=int(get("new_customers")) if get("new_customers") else None,
            total_transactions=int(get("total_transactions")) if get("total_transactions") else None,
            total_seats=int(get("total_seats")) if get("total_seats") else None,
            churn_rate=get("churn_rate") if get("churn_rate") else None,
            cac=get("cac") if get("cac") else None,
            ltv=get("ltv") if get("ltv") else None,
        )

        # --- Metadata (all values stringified for transport) ---
        metadata = {
            "extraction_method": extraction_method,
            "extracted_restaurant_margin": str(get("restaurant_margin")),
            "extracted_effective_tax_rate": str(get("effective_tax_rate")),
        }

        if classification:
            metadata["document_type"] = classification.doc_type
            metadata["document_type_display"] = getattr(classification, 'doc_type_display', '')
            metadata["classification_confidence"] = str(classification.confidence)
            metadata["match_percentage"] = str(getattr(classification, 'match_percentage', 0.0))
            metadata["needs_review"] = str(getattr(classification, 'needs_review', False))
            metadata["matched_keywords"] = ",".join(getattr(classification, 'matched_keywords', [])[:20])
            metadata["extractable_fields"] = ",".join(getattr(classification, 'extractable_fields', []))
            metadata["detected_sections"] = ",".join(classification.detected_sections)
            metadata["secondary_types"] = ",".join(getattr(classification, 'secondary_types', []))

        # --- Company Name ---
        company_name = HybridPDFParser._resolve_company_name(
            dolphin_name=dolphin_company_name,
            text_content=text_content,
            file_path=file_path,
        )

        # --- Fiscal Year ---
        fiscal_year_date = HybridPDFParser._resolve_fiscal_year(
            dolphin_year=dolphin_fiscal_year,
            text_content=text_content,
        )

        return FinancialReport(
            company_name=company_name,
            period_end=fiscal_year_date,
            period_type=PeriodType.ANNUAL,
            currency=Currency.USD,
            income_statement=income,
            balance_sheet=balance,
            cash_flow=cash_flow,
            metrics=metrics,
            metadata=metadata,
        )

    # ==================================================================
    # Name & Date Resolution
    # ==================================================================

    @staticmethod
    def _resolve_company_name(
        dolphin_name: Optional[str],
        text_content: str,
        file_path: str,
    ) -> str:
        """Resolve company name: Dolphin → text heuristics → filename."""
        if dolphin_name:
            return dolphin_name

        # Replicate the core of PDFParser's (inline) name heuristics:
        # skip boilerplate headers, prefer the registrant line of SEC
        # filings, then fall back to the first meaningful line.
        lines = text_content.split("\n")
        ignored = {
            "TABLE OF CONTENTS", "CONTENTS", "INDEX", "FINANCIAL STATEMENTS",
            "CONSOLIDATED FINANCIAL STATEMENTS", "ANNUAL REPORT", "QUARTERLY REPORT",
            "10-K", "10-Q", "FORM 10-K", "FORM 10-Q", "UNITED STATES",
            "SECURITIES AND EXCHANGE COMMISSION", "WASHINGTON", "D.C.",
        }

        # SEC filing heuristic: company name appears just above the
        # "(exact name of registrant ...)" caption.
        registrant_idx = -1
        for i, line in enumerate(lines[:100]):
            if "exact name of registrant" in line.lower():
                registrant_idx = i
                break

        if registrant_idx > 0:
            for j in range(registrant_idx - 1, -1, -1):
                candidate = lines[j].strip()
                if len(candidate) > 2 and not any(ig in candidate.upper() for ig in ignored):
                    return candidate[:100]

        # First meaningful line
        for line in lines[:40]:
            candidate = line.strip()
            if (
                len(candidate) > 2
                and not any(ig in candidate.upper() for ig in ignored)
                and not candidate.isdigit()
                and any(c.isalpha() for c in candidate)
            ):
                return candidate[:100]

        # Filename fallback
        import os
        basename = os.path.basename(file_path)
        return os.path.splitext(basename)[0].replace("-", " ").replace("_", " ")

    @staticmethod
    def _resolve_fiscal_year(
        dolphin_year: Optional[str],
        text_content: str,
    ) -> date:
        """Resolve fiscal year: Dolphin → text patterns → today."""
        # Try Dolphin result first; accept any plausible 4-digit year and
        # normalize to a Dec-31 period end.
        if dolphin_year:
            year_match = re.search(r"\d{4}", dolphin_year)
            if year_match:
                y = int(year_match.group(0))
                if 1990 <= y <= date.today().year + 1:
                    return date(y, 12, 31)

        # Reuse PDFParser's fiscal year extraction
        from app.services.ingestion.parser_pdf import PDFParser
        return PDFParser._extract_fiscal_year(text_content)
app/services/ingestion/parser_pdf.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import re
3
+ from typing import Dict, Any, Optional, List
4
+ from app.schemas.financial import (
5
+ FinancialReport,
6
+ BalanceSheetStandard,
7
+ IncomeStatementStandard,
8
+ CashFlowStandard,
9
+ OperatingMetrics,
10
+ PeriodType,
11
+ Currency
12
+ )
13
+ from datetime import date
14
+ from app.services.ingestion.mappings import DataMapper
15
+
16
class PDFParser:
    """Standard PDF financial-statement parser (pdfplumber tables + regex).

    `parse` delegates to the hybrid Dolphin pipeline; the remaining
    static helpers implement the pdfplumber extraction primitives that
    HybridPDFParser reuses.
    """

    @staticmethod
    def parse(file_path: str) -> FinancialReport:
        """
        Delegates to HybridPDFParser to enable AI-enhanced extraction
        with automatic fallback to standard pdfplumber logic.
        """
        # Lazy import to avoid circular dependency.
        # NOTE: HybridPDFParser lives in app/services/ingestion/parser_dolphin.py,
        # not inside the dolphin/ subpackage.
        from app.services.ingestion.parser_dolphin import HybridPDFParser
        return HybridPDFParser.parse(file_path)

    @staticmethod
    def _finalize_report(name, income, balance, cash, metrics, meta, period_end):
        """Helper to construct the final FinancialReport object."""
        return FinancialReport(
            company_name=name,
            period_end=period_end,
            period_type=PeriodType.ANNUAL,
            currency=Currency.USD,
            income_statement=income,
            balance_sheet=balance,
            cash_flow=cash,
            metrics=metrics,
            metadata=meta
        )

    @staticmethod
    def _extract_fiscal_year(text: str) -> date:
        """Finds the fiscal year end date from the text.

        Searches the first 5000 chars for "Year/Period Ended <Month> <d>, <yyyy>"
        or "December 31, <yyyy>" and returns Dec 31 of the latest plausible
        year found; falls back to today's date.
        """
        patterns = [
            r"(?:YEAR|PERIOD|FISCAL YEAR)\s+ENDED\s+([A-Z]+\s+\d{1,2},\s+\d{4})",
            r"DECEMBER\s+31,\s+(\d{4})"
        ]

        current_year = date.today().year
        found_years = []

        for pat in patterns:
            matches = re.findall(pat, text[:5000], re.IGNORECASE)  # Header area only
            for m in matches:
                if isinstance(m, tuple): m = m[0]
                # Extract the 4-digit year from the matched date string
                year_match = re.search(r"\d{4}", m)
                if year_match:
                    y = int(year_match.group(0))
                    if 1990 <= y <= current_year + 1:
                        found_years.append(y)

        if found_years:
            # The max year in the header is normally the current report year.
            best_year = max(found_years)
            return date(best_year, 12, 31)  # Default to Dec 31

        return date.today()

    @staticmethod
    def _find_statement_pages(pdf) -> Dict[str, Any]:
        """Identifies pages containing specific financial statements.

        Returns a dict mapping "income"/"balance"/"cash_flow" to the first
        page whose text matches that statement's title keywords.
        """
        pages = {}
        for page in pdf.pages:
            text = (page.extract_text() or "").upper()

            # Skip Table of Contents pages (unless they contain financial data like '$')
            if ("TABLE OF CONTENTS" in text[:500] or "INDEX" in text[:200]) and "$" not in text[:2000]:
                continue

            # Income statement titles
            # (fixed: "DISSOLIDATED" typo -> "CONSOLIDATED STATEMENTS OF LOSS")
            if any(x in text for x in ["CONSOLIDATED STATEMENTS OF OPERATIONS", "CONSOLIDATED STATEMENTS OF INCOME", "CONSOLIDATED STATEMENTS OF EARNINGS", "CONSOLIDATED STATEMENTS OF LOSS", "STATEMENT OF INCOME", "STATEMENTS OF OPERATIONS"]):
                if "income" not in pages: pages["income"] = page

            # Balance sheet titles
            elif any(x in text for x in ["CONSOLIDATED BALANCE SHEETS", "CONSOLIDATED STATEMENTS OF FINANCIAL POSITION", "BALANCE SHEETS", "FINANCIAL POSITION"]):
                if "balance" not in pages: pages["balance"] = page

            # Cash flow statement titles
            elif any(x in text for x in ["CONSOLIDATED STATEMENTS OF CASH FLOWS", "CONSOLIDATED STATEMENT OF CASH FLOWS", "STATEMENTS OF CASH FLOWS", "CASH FLOWS"]):
                if "cash_flow" not in pages: pages["cash_flow"] = page

        return pages

    @staticmethod
    def _extract_table_data(page, allowed_fields: Optional[List[str]] = None) -> Dict[str, float]:
        """Extracts key-value pairs from tables on the page with smart column selection.

        Picks the column headed by the most recent year, detects an
        "(in thousands)"/"(in millions)" scale note, and maps row labels
        through DataMapper (optionally restricted to `allowed_fields`).
        """
        data = {}
        tables = page.extract_tables()

        for table in tables:
            # 1. Identify "Current Year" column: scan first 5 rows for
            # 4-digit years (e.g. 2024, 2023) and keep the max-year column.
            target_col_idx = -1
            max_year = 0

            headers = table[:5]
            for row in headers:
                for idx, cell in enumerate(row):
                    if not cell: continue
                    cleaned = cell.replace("$", "").strip()
                    if re.match(r"^\d{4}$", cleaned):
                        y = int(cleaned)
                        if 2000 < y < 2100:
                            if y > max_year:
                                max_year = y
                                target_col_idx = idx

            # If no year found, fall back to first numeric column below.

            # 2. Header-based scaling detection: "(in thousands)",
            # "(in millions)", etc. in the top-of-page text.
            multiplier = 1.0
            header_text = (page.extract_text() or "")[:1000].lower()

            if re.search(r"\(in millions\)|in millions, except|dollares en millones|amounts in millions|dollars in millions", header_text):
                multiplier = 1000000.0
            elif re.search(r"\(in thousands\)|in thousands, except|dollares en miles|amounts in thousands|dollars in thousands|\(in 000s\)", header_text):
                multiplier = 1000.0

            for row in table:
                if not row or not row[0]: continue

                label = row[0]
                mapped_field = DataMapper.map_row(label)

                if mapped_field:
                    if allowed_fields is not None and mapped_field not in allowed_fields:
                        continue

                    # Extract value from the trusted year column, else the
                    # first parsable numeric column.
                    val = None
                    if target_col_idx != -1 and target_col_idx < len(row):
                        val = PDFParser._clean_value(row[target_col_idx])
                    else:
                        for col_val in row[1:]:
                            clean_val = PDFParser._clean_value(col_val)
                            if clean_val is not None:
                                val = clean_val
                                break

                    if val is not None:
                        data[mapped_field] = val * multiplier
        return data

    @staticmethod
    def _clean_value(val_str: Optional[str]) -> Optional[float]:
        """Converts financial string formats to float. Handles parentheses for negative."""
        if not val_str:
            return None

        s = val_str.strip().replace("$", "").replace(",", "").replace(" ", "")
        if not s:
            return None

        # Handle (123) as negative
        if "(" in s and ")" in s:
            s = s.replace("(", "-").replace(")", "")

        # Handle - as 0 (accounting format sometimes uses - for 0)
        if s == "-" or s == "—":
            return 0.0

        try:
            return float(s)
        except ValueError:
            return None

    @staticmethod
    def _extract_via_regex(text_content: str, existing_keys: List[str]) -> Dict[str, float]:
        """Fallback extraction for items not found in tables.

        For each unmapped field, scans the text for any alias followed by
        a number ("Keyword $1,234.56" or "Keyword....... 1,234.56").
        """
        data = {}
        for field, aliases in DataMapper.FIELD_MAPPING.items():
            if field in existing_keys:
                continue

            for k in aliases:
                # re.escape the alias: labels may contain regex
                # metacharacters (e.g. "P&L (loss)") which would otherwise
                # break or distort the pattern.
                pattern = re.compile(rf"{re.escape(k)}[^0-9-]*?(\(?[\d,]+\.?\d*\)?)", re.IGNORECASE)
                match = pattern.search(text_content)
                if match:
                    val = PDFParser._clean_value(match.group(1))
                    if val is not None:
                        data[field] = val
                        break
        return data
app/services/ingestion/parser_xlsx.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ XLSX Parser - Excel file parsing for financial data.
3
+
4
+ Parses Excel workbooks containing financial statements, handling:
5
+ - Multi-sheet detection (Income Statement, Balance Sheet, Cash Flow)
6
+ - Single-sheet condensed format
7
+ - Various column/row layouts
8
+ """
9
+
10
+ import re
11
+ from typing import Dict, Any, Optional, List
12
+ from datetime import date
13
+
14
+ try:
15
+ import openpyxl
16
+ from openpyxl import load_workbook
17
+ from openpyxl.worksheet.worksheet import Worksheet
18
+ except ImportError:
19
+ openpyxl = None
20
+
21
+ import pandas as pd
22
+
23
+ from app.schemas.financial import (
24
+ FinancialReport,
25
+ BalanceSheetStandard,
26
+ IncomeStatementStandard,
27
+ CashFlowStandard,
28
+ OperatingMetrics,
29
+ PeriodType,
30
+ Currency
31
+ )
32
+ from app.services.ingestion.mappings import DataMapper
33
+
34
+
35
+ class XLSXParser:
36
+ """Parser for Excel (.xlsx, .xls) financial files."""
37
+
38
+ # Keywords to identify sheet types
39
+ INCOME_KEYWORDS = ['income', 'p&l', 'profit', 'loss', 'revenue', 'earnings']
40
+ BALANCE_KEYWORDS = ['balance', 'assets', 'liabilities', 'position']
41
+ CASHFLOW_KEYWORDS = ['cash flow', 'cashflow', 'cash', 'liquidity']
42
+
43
+ @staticmethod
44
+ def parse(file_path: str) -> FinancialReport:
45
+ """
46
+ Parse an Excel file and return a standardized FinancialReport.
47
+
48
+ Handles both multi-sheet and single-sheet formats.
49
+ """
50
+ if openpyxl is None:
51
+ # Fallback to pandas-only parsing
52
+ return XLSXParser._parse_with_pandas(file_path)
53
+
54
+ try:
55
+ wb = load_workbook(file_path, data_only=True)
56
+
57
+ # Categorize sheets
58
+ income_sheet = None
59
+ balance_sheet = None
60
+ cashflow_sheet = None
61
+
62
+ for sheet_name in wb.sheetnames:
63
+ name_lower = sheet_name.lower()
64
+
65
+ if any(kw in name_lower for kw in XLSXParser.INCOME_KEYWORDS):
66
+ income_sheet = wb[sheet_name]
67
+ elif any(kw in name_lower for kw in XLSXParser.BALANCE_KEYWORDS):
68
+ balance_sheet = wb[sheet_name]
69
+ elif any(kw in name_lower for kw in XLSXParser.CASHFLOW_KEYWORDS):
70
+ cashflow_sheet = wb[sheet_name]
71
+
72
+ # If no specialized sheets found, use first sheet for all
73
+ if not income_sheet and not balance_sheet and not cashflow_sheet:
74
+ default_sheet = wb.active
75
+ income_sheet = balance_sheet = cashflow_sheet = default_sheet
76
+
77
+ # Extract data from each sheet
78
+ data_dict = {}
79
+
80
+ if income_sheet:
81
+ data_dict.update(XLSXParser._extract_from_sheet(income_sheet))
82
+ if balance_sheet and balance_sheet != income_sheet:
83
+ data_dict.update(XLSXParser._extract_from_sheet(balance_sheet))
84
+ if cashflow_sheet and cashflow_sheet != income_sheet and cashflow_sheet != balance_sheet:
85
+ data_dict.update(XLSXParser._extract_from_sheet(cashflow_sheet))
86
+
87
+ # If still no data, try pandas fallback
88
+ if not data_dict:
89
+ return XLSXParser._parse_with_pandas(file_path)
90
+
91
+ # Extract company name from filename or first cell
92
+ company_name = XLSXParser._extract_company_name(wb)
93
+
94
+ return XLSXParser._build_report(data_dict, company_name)
95
+
96
+ except Exception as e:
97
+ # Fallback to pandas
98
+ print(f"openpyxl parse failed, falling back to pandas: {e}")
99
+ return XLSXParser._parse_with_pandas(file_path)
100
+
101
+ @staticmethod
102
+ def _extract_from_sheet(sheet: 'Worksheet') -> Dict[str, float]:
103
+ """Extract financial data from a worksheet."""
104
+ data = {}
105
+
106
+ # Try to find the data range
107
+ # Look for rows with label in first column and numeric value in subsequent columns
108
+ for row in sheet.iter_rows(min_row=1, max_row=min(200, sheet.max_row)):
109
+ if not row or not row[0].value:
110
+ continue
111
+
112
+ label = str(row[0].value).strip()
113
+ field = DataMapper.map_row(label)
114
+
115
+ if field:
116
+ # Find the first non-empty numeric value in this row
117
+ for cell in row[1:]:
118
+ if cell.value is not None:
119
+ try:
120
+ val = XLSXParser._clean_value(cell.value)
121
+ if val is not None:
122
+ data[field] = val
123
+ break
124
+ except:
125
+ continue
126
+
127
+ return data
128
+
129
+ @staticmethod
130
+ def _clean_value(val: Any) -> Optional[float]:
131
+ """Clean and convert a cell value to float."""
132
+ if val is None:
133
+ return None
134
+ if isinstance(val, (int, float)):
135
+ return float(val)
136
+ if isinstance(val, str):
137
+ # Remove currency symbols, commas, parentheses for negatives
138
+ cleaned = re.sub(r'[,$]', '', val.strip())
139
+ # Handle (1000) format for negatives
140
+ if cleaned.startswith('(') and cleaned.endswith(')'):
141
+ cleaned = '-' + cleaned[1:-1]
142
+ try:
143
+ return float(cleaned)
144
+ except ValueError:
145
+ return None
146
+ return None
147
+
148
+ @staticmethod
149
+ def _extract_company_name(wb) -> str:
150
+ """Try to extract company name from workbook."""
151
+ # Check first sheet, first few cells
152
+ sheet = wb.active
153
+ for row in sheet.iter_rows(min_row=1, max_row=5, max_col=3):
154
+ for cell in row:
155
+ if cell.value and isinstance(cell.value, str):
156
+ val = cell.value.strip()
157
+ # Skip common headers
158
+ if len(val) > 3 and len(val) < 100:
159
+ lower = val.lower()
160
+ if not any(kw in lower for kw in ['balance', 'income', 'cash', 'statement', 'period', 'date', 'quarter', 'annual']):
161
+ return val
162
+ return "Imported Company"
163
+
164
+ @staticmethod
165
+ def _parse_with_pandas(file_path: str) -> FinancialReport:
166
+ """Fallback parsing using pandas."""
167
+ try:
168
+ # Read all sheets
169
+ xl = pd.ExcelFile(file_path)
170
+ data_dict = {}
171
+
172
+ for sheet_name in xl.sheet_names:
173
+ df = pd.read_excel(xl, sheet_name=sheet_name)
174
+
175
+ if df.empty:
176
+ continue
177
+
178
+ # Try vertical format (label in col 0, value in col 1+)
179
+ if len(df.columns) >= 2:
180
+ for _, row in df.iterrows():
181
+ label = str(row.iloc[0]) if pd.notna(row.iloc[0]) else ""
182
+ field = DataMapper.map_row(label)
183
+ if field:
184
+ # Find first numeric value
185
+ for val in row.iloc[1:]:
186
+ if pd.notna(val):
187
+ try:
188
+ data_dict[field] = float(str(val).replace(',', '').replace('$', ''))
189
+ break
190
+ except:
191
+ continue
192
+
193
+ return XLSXParser._build_report(data_dict, "Imported Company")
194
+
195
+ except Exception as e:
196
+ print(f"Pandas XLSX parse failed: {e}")
197
+ return XLSXParser._build_empty_report()
198
+
199
+ @staticmethod
200
+ def _build_report(data_dict: Dict[str, float], company_name: str) -> FinancialReport:
201
+ """Build FinancialReport from extracted data."""
202
+ def get(key: str, default: float = 0.0) -> float:
203
+ return data_dict.get(key, default)
204
+
205
+ # Computed Income
206
+ revenue = get("revenue")
207
+ cogs = get("cogs")
208
+ marketing = get("marketing_expenses")
209
+ payroll = get("payroll_expenses")
210
+ rent = get("rent_expense")
211
+ other = get("other_operating_expenses")
212
+ depreciation = get("depreciation")
213
+ amortization = get("amortization")
214
+ interest = get("interest_expense")
215
+ taxes = get("taxes")
216
+
217
+ op_expenses = marketing + payroll + rent + other
218
+ gross_profit = revenue - cogs
219
+ ebitda = gross_profit - op_expenses
220
+ op_income = ebitda - depreciation - amortization
221
+ net_income = op_income - interest - taxes
222
+
223
+ income = IncomeStatementStandard(
224
+ revenue=revenue,
225
+ cogs=cogs,
226
+ marketing_expenses=marketing,
227
+ payroll_expenses=payroll,
228
+ rent_expense=rent,
229
+ other_operating_expenses=other,
230
+ depreciation=depreciation,
231
+ amortization=amortization,
232
+ interest_expense=interest,
233
+ taxes=taxes,
234
+ # Computed
235
+ operating_expenses=op_expenses,
236
+ gross_profit=gross_profit,
237
+ ebitda=ebitda,
238
+ operating_income=op_income,
239
+ net_income=net_income
240
+ )
241
+
242
+ # Computed Balance
243
+ cash = get("cash")
244
+ ar = get("accounts_receivable")
245
+ inv = get("inventory")
246
+ prepaid = get("prepaid_expenses")
247
+ ppe = get("property_plant_equipment")
248
+ accum_dep = get("accumulated_depreciation")
249
+ intangibles = get("intangible_assets")
250
+
251
+ ap = get("accounts_payable")
252
+ accrued = get("accrued_liabilities")
253
+ st_debt = get("short_term_debt")
254
+ lt_debt = get("long_term_debt")
255
+ deferred = get("deferred_revenue")
256
+ equity = get("total_equity")
257
+
258
+ bs_current_assets = cash + ar + inv + prepaid
259
+ bs_total_assets = bs_current_assets + (ppe - accum_dep) + intangibles
260
+ bs_current_liab = ap + accrued + st_debt
261
+ bs_total_liab = bs_current_liab + lt_debt + deferred
262
+
263
+ balance = BalanceSheetStandard(
264
+ cash=cash,
265
+ accounts_receivable=ar,
266
+ inventory=inv,
267
+ prepaid_expenses=prepaid,
268
+ property_plant_equipment=ppe,
269
+ accumulated_depreciation=accum_dep,
270
+ intangible_assets=intangibles,
271
+ accounts_payable=ap,
272
+ accrued_liabilities=accrued,
273
+ short_term_debt=st_debt,
274
+ long_term_debt=lt_debt,
275
+ deferred_revenue=deferred,
276
+ total_equity=equity,
277
+ # Computed
278
+ total_current_assets=bs_current_assets,
279
+ total_assets=bs_total_assets,
280
+ total_current_liabilities=bs_current_liab,
281
+ total_liabilities=bs_total_liab
282
+ )
283
+
284
+ cash_flow = CashFlowStandard(
285
+ operating_cash_flow=get("operating_cash_flow"),
286
+ capex=get("capex"),
287
+ investing_cash_flow=get("investing_cash_flow"),
288
+ financing_cash_flow=get("financing_cash_flow")
289
+ )
290
+
291
+ metrics = OperatingMetrics(
292
+ industry='general',
293
+ new_customers=int(get("new_customers")) if get("new_customers") else None,
294
+ total_transactions=int(get("total_transactions")) if get("total_transactions") else None,
295
+ total_seats=int(get("total_seats")) if get("total_seats") else None
296
+ )
297
+
298
+ return FinancialReport(
299
+ company_name=company_name,
300
+ period_end=date.today(),
301
+ period_type=PeriodType.ANNUAL,
302
+ currency=Currency.USD,
303
+ income_statement=income,
304
+ balance_sheet=balance,
305
+ cash_flow=cash_flow,
306
+ metrics=metrics
307
+ )
308
+
309
+ @staticmethod
310
+ def _build_empty_report() -> FinancialReport:
311
+ """Build an empty report as last resort."""
312
+ return XLSXParser._build_report({}, "Unknown Company")
app/services/ingestion/unified_parser.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unified Parser - Central coordinator for all file format parsing.
3
+
4
+ This module provides a single entry point for parsing any supported
5
+ financial document format (CSV, PDF, XLSX).
6
+ """
7
+
8
+ from typing import Tuple
9
+ from app.schemas.financial import FinancialReport
10
+
11
+
12
class UnifiedParser:
    """
    Central parser that routes files to appropriate format-specific parsers.

    Supported formats:
    - CSV: Comma-separated values
    - PDF: PDF documents (10-K, 10-Q, financial reports)
    - XLSX/XLS: Excel workbooks
    """

    # Maps a lowercase file extension to the parser family that handles it.
    SUPPORTED_EXTENSIONS = {
        'csv': 'csv',
        'pdf': 'pdf',
        'xlsx': 'xlsx',
        'xls': 'xlsx',  # Route both to XLSX parser
    }

    @staticmethod
    def get_format(filename: str) -> str:
        """
        Determine file format from filename.

        Returns: 'csv', 'pdf', or 'xlsx', or raises ValueError
        """
        extension = filename.lower().rsplit('.', 1)[-1] if '.' in filename else ''
        try:
            return UnifiedParser.SUPPORTED_EXTENSIONS[extension]
        except KeyError:
            raise ValueError(f"Unsupported file format: .{extension}. Supported: .csv, .pdf, .xlsx, .xls") from None

    @staticmethod
    def parse(file_path: str, filename: str) -> 'FinancialReport':
        """
        Parse a financial document and return standardized FinancialReport.

        Args:
            file_path: Path to the saved file on disk
            filename: Original filename (used for format detection)

        Returns:
            FinancialReport with standardized financial data

        Raises:
            ValueError: If file format is not supported
        """
        detected = UnifiedParser.get_format(filename)

        # Parsers are imported lazily so unused format backends (and their
        # heavy dependencies) are never loaded.
        if detected == 'csv':
            from app.services.ingestion.parser_csv import CSVParser
            return CSVParser.parse(file_path)

        if detected == 'pdf':
            from app.services.ingestion.parser_dolphin import HybridPDFParser
            return HybridPDFParser.parse(file_path)

        if detected == 'xlsx':
            from app.services.ingestion.parser_xlsx import XLSXParser
            return XLSXParser.parse(file_path)

        raise ValueError(f"No parser available for format: {detected}")

    @staticmethod
    def is_supported(filename: str) -> bool:
        """Check if a filename has a supported extension."""
        if '.' not in filename:
            return False
        return filename.lower().rsplit('.', 1)[-1] in UnifiedParser.SUPPORTED_EXTENSIONS

    @staticmethod
    def get_supported_extensions() -> list:
        """Return list of supported file extensions."""
        return [ext for ext in UnifiedParser.SUPPORTED_EXTENSIONS]
app/services/intelligence/ai_cfo.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.financial import StandardizedDataPackage
2
+ import os
3
+
4
class AICFOService:
    @staticmethod
    def generate_executive_summary(data: 'StandardizedDataPackage') -> str:
        """
        Generates a natural language executive summary using a generative AI model.
        Currently scaffolds the prompt construction and mocks the response if no API key is present.
        """

        # 1. Construct Context
        company = data.raw_data.company_name
        revenue = data.raw_data.income_statement.revenue
        margin = data.kpis.net_margin
        score = data.risk_analysis.risk_score
        pain_points = ', '.join(p for p in data.insights if 'Pain' in p)

        # Prompt is scaffolding for the (not yet wired) Gemini call below;
        # intentionally unused until that integration lands.
        prompt = f"""
        You are an elite CFO advising the CEO of {company}.
        Financial Snapshot:
        - Annual Revenue: ${revenue:,.2f}
        - Net Margin: {margin:.1f}%
        - Overall Risk Score: {score}/100
        - Top Pain Points: {pain_points}

        Write a 3-paragraph executive summary:
        1. The Good: What is working well?
        2. The Bad: What are the immediate risks?
        3. The Ugly: What needs drastic change immediately?

        Keep it punchy, professional, and actionable.
        """

        # 2. Call LLM (Placeholder for Gemini)
        # api_key = os.getenv("GEMINI_API_KEY")
        # if api_key:
        #     return call_gemini(api_key, prompt)

        # 3. Mock Response (Fallback)
        sections = [
            f"## Executive Summary for {company}\n\n",
            "**The Good:**\n",
            f"Your revenue is strong at ${revenue:,.0f}, demonstrating clear market demand. ",
            f"A net margin of {margin:.1f}% is respectable, indicating your core unit economics are sound. ",
            f"With a Health Score of {data.health_score.total_score}/100, the business foundation is stable.\n\n",
            "**The Bad:**\n",
            "We detected some potential liquidity friction locally. Your burn rate suggests you might have constrained runway if sales dip. ",
            "Optimization of COGS could yield an immediate 2-3% bottom-line improvement.\n\n",
            "**The Ugly:**\n",
            "No catastrophic risks detected immediately, but reliance on a single revenue stream could be a blind spot. ",
            "I recommend diversifying customer acquisition channels immediately to safeguard against volatility.",
        ]
        return "".join(sections)
app/services/intelligence/gemini_service.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import requests
4
+ import json
5
+ from dotenv import load_dotenv
6
+ from app.schemas.chat import ChatRequest, ChatResponse
7
+ from app.schemas.financial import StandardizedDataPackage
8
+
9
+ # Load .env file
10
+ load_dotenv()
11
+
12
class GeminiService:
    """Thin wrapper around the Google Generative Language REST API.

    Provides chat-style answers grounded in a financial context, a generic
    text generator for internal services, and an automatic fallback chain
    across several Gemini models when one hits its quota. All failure paths
    return clean user-presentable text — never raw JSON or tracebacks.
    """

    # Read once at import time; a missing key degrades to fallback content.
    API_KEY = os.getenv("GEMINI_API_KEY")

    # Model fallback chain - try in order, fall back if quota exceeded
    MODELS = [
        "gemini-3-flash",         # Primary - fastest, newest
        "gemini-2.5-flash",       # Fallback 1 - stable
        "gemini-2.5-flash-lite",  # Fallback 2 - lightweight
        "gemini-2.0-flash",       # Fallback 3 - legacy stable
    ]

    # Models that returned HTTP 429 in this process; skipped until reset.
    _exhausted_models = set()

    @classmethod
    def _get_api_url(cls, model_name: str) -> str:
        """Generate API URL for a specific model."""
        return f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent?key={cls.API_KEY}"

    @classmethod
    def _reset_exhausted_models(cls):
        """Reset exhausted models (call periodically or on new day)."""
        cls._exhausted_models.clear()

    @staticmethod
    def _parse_error_response(status_code: int, response_text: str) -> str:
        """
        Map an HTTP error status to a clean, user-friendly message.
        Never expose raw JSON to users. ``response_text`` is accepted for
        future use (e.g. logging) but is intentionally never surfaced.
        """
        if status_code == 429:
            return "AI service is temporarily busy. Please try again in a few moments."
        elif status_code in (401, 403):
            return "AI service authentication failed. Please check your API key configuration."
        elif status_code == 400:
            return "Invalid request to AI service. Please try a simpler query."
        elif status_code == 500:
            return "AI service is experiencing issues. Please try again later."
        elif status_code == 503:
            return "AI service is temporarily unavailable. Please try again later."
        else:
            return f"AI service returned an unexpected error (Code: {status_code}). Please try again."

    @classmethod
    def _try_request(cls, payload: dict, timeout: int = 30) -> tuple[bool, str, str]:
        """
        Try to make a request using available models with automatic fallback.

        Returns: (success: bool, response_text: str, model_used: str).
        On failure, ``response_text`` is the last user-presentable error.
        """
        if not cls.API_KEY:
            return False, "Gemini API Key is missing. Please configure GEMINI_API_KEY.", ""

        headers = {"Content-Type": "application/json"}
        last_error = ""

        for model in cls.MODELS:
            # Skip models that have hit their quota this session
            if model in cls._exhausted_models:
                continue

            try:
                api_url = cls._get_api_url(model)
                response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

                if response.status_code == 200:
                    result = response.json()
                    try:
                        text = result['candidates'][0]['content']['parts'][0]['text']
                        return True, text, model
                    except (KeyError, IndexError):
                        # 200 with no candidates/parts: treat as empty output.
                        last_error = "AI generated empty response."
                        continue

                elif response.status_code == 429:
                    # Model quota exceeded - mark as exhausted and try next
                    cls._exhausted_models.add(model)
                    print(f"Model {model} quota exceeded, trying next model...")
                    last_error = "All AI models are currently at capacity."
                    continue

                else:
                    # Other error - try next model
                    last_error = cls._parse_error_response(response.status_code, response.text)
                    continue

            except requests.exceptions.Timeout:
                last_error = "AI service timed out."
                continue
            except requests.exceptions.ConnectionError:
                last_error = "Unable to connect to AI service."
                continue
            except Exception:
                last_error = "An unexpected error occurred."
                continue

        # All models exhausted
        return False, last_error, ""

    @classmethod
    def query(cls, request: 'ChatRequest', context_data: 'StandardizedDataPackage') -> 'ChatResponse':
        """Answer a user chat message with the financial context injected into the prompt."""
        if not cls.API_KEY:
            return ChatResponse(response="Gemini API Key is missing. Please configure GEMINI_API_KEY in the backend.")

        # Construct Prompt with Financial Context
        system_prompt = f"""
        You are Visique, an expert AI CFO. You are analyzing the financial data for {context_data.raw_data.company_name}.

        Financial Context:
        - Revenue: {context_data.raw_data.income_statement.revenue} {context_data.raw_data.currency}
        - Net Income: {context_data.raw_data.income_statement.net_income}
        - Cash Balance: {context_data.raw_data.balance_sheet.cash}
        - Health Score: {context_data.health_score.total_score}/100

        Key Insights:
        {json.dumps(context_data.insights, indent=2)}

        Optimization Insights (Heatmap/Dead Zones):
        {json.dumps([z for z in context_data.optimization_insights.dead_zones] if context_data.optimization_insights else [], indent=2)}

        User Question: {request.message}

        Answer concisely as a CFO. If the user asks about "Dynamic Promos" or "Optimization", refer to the Dead Zones data.
        """

        payload = {
            "contents": [{
                "parts": [{"text": system_prompt}]
            }]
        }

        # _try_request returns user-presentable text on both success and
        # failure, so the result is returned either way (the original had an
        # if/else whose two branches were identical).
        success, response_text, _model_used = cls._try_request(payload)
        return ChatResponse(response=response_text)

    @classmethod
    def generate_content(cls, prompt: str) -> str:
        """
        Generic generator for internal services (like GeoService).
        Uses automatic model fallback. Returns clean, presentable text.
        """
        if not cls.API_KEY:
            return "Strategic insights require AI configuration. Contact support for assistance."

        payload = {
            "contents": [{
                "parts": [{"text": prompt}]
            }]
        }

        success, response_text, _model_used = cls._try_request(payload)

        if success:
            return response_text
        # Return intelligent fallback content instead of error
        return cls._get_fallback_content(prompt)

    @staticmethod
    def _get_fallback_content(prompt: str) -> str:
        """
        Provide meaningful fallback content when ALL AI models are unavailable.
        This ensures reports and displays never show error messages.
        The branch is chosen by keyword-sniffing the prompt.
        """
        prompt_lower = prompt.lower()

        if "competitor" in prompt_lower or "landscape" in prompt_lower:
            return """**Market Analysis**

Based on industry standards for your sector:

• **Primary Competition**: Focus on businesses within a 5-mile radius offering similar services
• **Traffic Patterns**: Peak hours typically align with lunch (11am-2pm) and evening (5pm-8pm) periods
• **Differentiation**: Evaluate unique value propositions against local alternatives

*AI-powered real-time analysis available when capacity permits.*"""

        elif "strategic" in prompt_lower or "context" in prompt_lower:
            return """**Strategic Context Overview**

Key considerations for your market:

• **Regulatory Environment**: Stay current with local business regulations and licensing requirements
• **Economic Indicators**: Monitor regional employment and consumer spending trends
• **Industry Outlook**: Your sector shows stable fundamentals with growth potential

*Enhanced AI insights will be available shortly.*"""

        elif "marketing" in prompt_lower or "growth" in prompt_lower:
            return """**Growth Strategy Framework**

Recommended focus areas for sustainable growth:

• **Digital Presence**: Optimize Google Business Profile and local SEO
• **Customer Retention**: Implement loyalty programs to increase lifetime value
• **Community Engagement**: Partner with local organizations for visibility

*AI-powered personalized recommendations available when capacity permits.*"""

        else:
            return """**Analysis Summary**

Your financial data has been processed successfully. Key takeaways:

• Review the health score breakdown for areas of strength and improvement
• Monitor cash runway projections for operational planning
• Consider the recommendations provided for optimization opportunities

*For deeper AI-driven insights, please try again in a few minutes.*"""

    @classmethod
    def get_model_status(cls) -> dict:
        """
        Get current status of available models (for debugging/admin).
        """
        available_models = [m for m in cls.MODELS if m not in cls._exhausted_models]
        exhausted = list(cls._exhausted_models)

        return {
            "total_models": len(cls.MODELS),
            "available_models": available_models,
            "exhausted_models": exhausted,
            "all_exhausted": len(available_models) == 0
        }
+
app/services/intelligence/geo_service.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import random
3
+
4
class GeoService:
    """Location intelligence: competitor, regulatory, and marketing analysis.

    Uses Gemini when an API key is configured; otherwise returns simulated
    mock content (with randomized metrics) so downstream reports still render.
    """

    @staticmethod
    def analyze_location(address: str, industry: str = "General", is_own_company: bool = False, company_name: str = ""):
        """
        Generates strategic analysis using Google Gemini if available,
        otherwise falls back to simulation.

        :param address: The address to analyze
        :param industry: The industry type
        :param is_own_company: Whether this is the user's own company (enables more personalized insights)
        :param company_name: Name of the company being analyzed
        :return: dict with keys 'competitor_analysis', 'strategic_context',
            'marketing_strategy' (each a markdown string)
        """
        # Imported lazily, presumably to avoid a circular import between the
        # intelligence services at module load time — confirm.
        from app.services.intelligence.gemini_service import GeminiService

        context_prefix = f"for {company_name}" if company_name else ""
        personalization = "your business" if is_own_company else f"this {industry} business"

        # Check for Real AI Capability
        if GeminiService.API_KEY:
            try:
                # 1. Competitor Landscape
                p1 = f"Analyze the competitor landscape {context_prefix} for a {industry} business located at {address}. {'As the owner, provide actionable competitive intelligence.' if is_own_company else 'Provide general market context.'} Identify 3 competitors and describe the traffic patterns in the area. Limit to 150 words. Format with **Bold** headers."
                comp_summary = GeminiService.generate_content(p1)

                # 2. Strategic Context
                p2 = f"Provide a brief strategic context analysis for {address} regarding local regulations, news events, and economic sentiment for the {industry} sector {context_prefix}. {'Include specific recommendations for the owner.' if is_own_company else ''} Limit to 150 words."
                context_summary = GeminiService.generate_content(p2)

                # 3. Marketing Strategy
                p3 = f"Suggest a growth and marketing strategy for {personalization} at {address}. {'Be specific with actionable next steps for the owner to implement.' if is_own_company else 'Provide general market positioning advice.'} Include digital positioning advice and 2 actionable recommendations. Limit to 150 words."
                marketing_summary = GeminiService.generate_content(p3)

                return {
                    "competitor_analysis": comp_summary,
                    "strategic_context": context_summary,
                    "marketing_strategy": marketing_summary
                }
            except Exception as e:
                # generate_content already returns fallback text on API errors,
                # so this mainly guards against unexpected local failures.
                print(f"Gemini Generation Failed: {e}. Falling back to simulation.")
                # Fallthrough to default logic below

        # ... FALLBACK MOCK DATA ...
        # Mocking external data capabilities. NOTE: scores below come from the
        # module-level `random` import, so the mock output is nondeterministic.
        competitors = [
            "Alpha Competitor Inc.", "Beta Rivals LLC", "Local Market Leader"
        ] if industry != "Restaurant" else [
            "The Hungry Chef", "Burger King", "Downtown Bistro"
        ]

        ownership_note = "As the owner of this business," if is_own_company else "For this business,"
        company_ref = company_name if company_name else "the business"

        # 1. Competitor & Location Analysis (Page 1 content)
        comp_summary = f"""
**Location Analysis for:** {address}
**Company:** {company_ref}
**Industry Focus:** {industry}

**Competitor Landscape:**
{ownership_note} we have identified {len(competitors)} primary competitors within a 5-mile radius:
{', '.join(competitors)}.

**Traffic Patterns:**
Based on historical data, the highest foot traffic in your area occurs between 11:00 AM and 2:00 PM on weekdays.

**Site Accessibility:**
Your location has a Walk Score of {random.randint(40, 95)}/100 and Transit Score of {random.randint(30, 80)}/100.
"""

        # 2. Political & Local News Context (Page 2 content)
        context_summary = f"""
**Strategic Context: Local & Political Landscape**

**Regulatory Updates:**
Recent city council proceedings indicate a favorable shift for {industry} businesses.

**Economic Sentiment:**
Local consumer sentiment is currently 'Optimistic' with a spending index of {random.randint(90, 110)}.

{"**Owner Action Item:** Engage with local business association for networking opportunities." if is_own_company else ""}
"""

        # 3. Marketing & Growth Opportunities (Page 3 content)
        marketing_summary = f"""
**Growth & Marketing Strategy for {company_ref}**

**Key Marketing Events:**
Leverage upcoming local opportunities like the Annual City Festival.

**Actionable Recommendations:**
1. **Hyper-Local SEO:** {"Optimize your" if is_own_company else "Optimize the"} Google Business Profile for '{company_ref}'.
2. **Community Partnerships:** Engage with local news events and neighborhood associations.
{"3. **Owner Priority:** Focus on building customer reviews - aim for 50+ 5-star reviews." if is_own_company else ""}
"""

        return {
            "competitor_analysis": comp_summary,
            "strategic_context": context_summary,
            "marketing_strategy": marketing_summary
        }
+