import uvicorn
from fastapi import FastAPI, Header, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import base64
import io
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import librosa
import scipy.signal as signal
from transformers import Wav2Vec2Model
from huggingface_hub import hf_hub_download

# ==========================================
# 1. CONFIGURATION
# ==========================================
REPO_ID = "TaterTots123/human.ai"
FILENAME = "final.pth"

# Get API Key from Environment Variable
SECRET_KEY = os.getenv("API_KEY")
if not SECRET_KEY:
    print("⚠️ WARNING: API_KEY not found in environment variables!")

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"⬇️ Downloading model from {REPO_ID}...")
try:
    MODEL_PATH = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
    print(f"✅ Model downloaded to: {MODEL_PATH}")
except Exception as e:
    print(f"❌ Failed to download model: {e}")

# ==========================================
# 2. MODEL ARCHITECTURE
# ==========================================
class AASIST_Backend(nn.Module):
    def __init__(self, in_channels=128, emb_dim=128):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=(3, 1), padding=(1, 0))
        self.conv2 = nn.Conv2d(64, 128, kernel_size=(3, 1), padding=(1, 0))
        self.bn1 = nn.BatchNorm2d(64)
        self.bn2 = nn.BatchNorm2d(128)
        self.attention = nn.Sequential(
            nn.Linear(128, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )
        self.fc = nn.Linear(128, emb_dim)
        self.classifier = nn.Linear(emb_dim, 2)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = x.squeeze(-1).transpose(1, 2)
        w = torch.softmax(self.attention(x), dim=1)
        x = torch.sum(w * x, dim=1)
        x = F.relu(self.fc(x))
        return self.classifier(x)


class VoiceDetector(nn.Module):
    def __init__(self):
        super().__init__()
        print("🌍 Initializing MMS-300M Backbone...")
        self.backbone = Wav2Vec2Model.from_pretrained("facebook/mms-300m")
        self.proj = nn.Linear(1024, 128)
        self.backend = AASIST_Backend()

    def forward(self, wav_input):
        with torch.no_grad():
            outputs = self.backbone(wav_input)
            features = outputs.last_hidden_state
        x = self.proj(features).transpose(1, 2).unsqueeze(-1)
        return self.backend(x)


# ==========================================
# 3. SERVER SETUP
# ==========================================
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

print(f"⏳ Loading Model onto {DEVICE}...")
model = VoiceDetector().to(DEVICE)
try:
    state_dict = torch.load(MODEL_PATH, map_location=DEVICE)
    model.load_state_dict(state_dict)
    model.eval()
    print("✅ Model Loaded Successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")


# ==========================================
# 4. PREPROCESSING HELPER
# ==========================================
def preprocess_tri_series(wav, sr=16000):
    """
    Applies: Bandpass Filter (70Hz-8kHz) -> Z-Score Normalization
    (Resampling is handled during load)
    """
    # 1. Bandpass Filter (70Hz - 7999Hz)
    # Removes low rumble and high aliasing noise
    if len(wav) > 0:
        sos = signal.butter(6, [70, 7999], btype='bandpass', fs=sr, output='sos')
        wav = signal.sosfilt(sos, wav)

    # 2. Z-Score Normalization
    # Standardizes amplitude
    if len(wav) > 0:
        mean = np.mean(wav)
        std = np.std(wav) + 1e-9
        wav = (wav - mean) / std

    return wav
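
# Optional sanity check for the preprocessing above (reference only; it is not
# called anywhere by the server). It illustrates the expected effect of the
# bandpass + z-score chain on a synthetic 440 Hz tone: the output should be
# approximately zero-mean with unit standard deviation. The function name and
# parameters are illustrative, not part of the original module.
def _check_preprocessing(duration_s=1.0, sr=16000):
    t = np.linspace(0, duration_s, int(sr * duration_s), endpoint=False)
    test_wav = 0.1 * np.sin(2 * np.pi * 440.0 * t)  # synthetic test tone
    processed = preprocess_tri_series(test_wav, sr=sr)
    # Expect mean close to 0 and std close to 1 after z-score normalization
    print("mean:", np.mean(processed), "std:", np.std(processed))
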
# ==========================================
# 5. API ENDPOINTS
# ==========================================
@app.get("/")
def home():
    return {
        "status": "online",
        "message": "Voice Detection API is live. Accepts .wav and .mp3 only."
    }


class AudioRequest(BaseModel):
    audioBase64: str


@app.post("/api/voice-detection")
async def detect_voice(req: AudioRequest, x_api_key: str = Header(None)):
    # 1. Validate API Key
    if x_api_key != SECRET_KEY:
        raise HTTPException(status_code=401, detail="Invalid API Key")

    try:
        # 2. Decode Base64
        try:
            if "base64," in req.audioBase64:
                req.audioBase64 = req.audioBase64.split("base64,")[1]
            audio_bytes = base64.b64decode(req.audioBase64)
        except Exception:
            return {"status": "error", "message": "Invalid Base64 string"}

        # 3. Direct Load (WAV/MP3 only)
        # Librosa handles loading directly from memory bytes
        try:
            wav, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
        except Exception as e:
            print(f"Audio Load Error: {e}")
            return {"status": "error", "message": "Invalid audio format. Please use WAV or MP3."}

        # 4. Apply Tri-Series Preprocessing
        wav = preprocess_tri_series(wav)

        # 5. Run 2-Way TTA (Original + Noisy)
        # --- Pass 1: Original ---
        tensor_orig = torch.tensor(wav, dtype=torch.float32).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            logits_orig = model(tensor_orig)
            probs_orig = torch.softmax(logits_orig, dim=1)
            score_orig = probs_orig[0][1].item()  # Index 1 = AI Score

        # --- Pass 2: Noisy (Robustness) ---
        noise = np.random.normal(0, 0.005, wav.shape)
        wav_noisy = wav + noise
        tensor_noisy = torch.tensor(wav_noisy, dtype=torch.float32).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            logits_noisy = model(tensor_noisy)
            probs_noisy = torch.softmax(logits_noisy, dim=1)
            score_noisy = probs_noisy[0][1].item()

        # 6. Average Results
        final_ai_score = (score_orig + score_noisy) / 2.0
        final_human_score = 1.0 - final_ai_score

        # 7. Classification Logic
        is_ai = final_ai_score > 0.50
        classification = "AI_GENERATED" if is_ai else "HUMAN"
        confidence = final_ai_score if is_ai else final_human_score

        # 8. Return Strictly Filtered Response
        return {
            "status": "success",
            "classification": classification,
            "confidenceScore": round(confidence, 4)
        }

    except Exception as e:
        print(f"Global Error: {e}")
        return {"status": "error", "message": str(e)}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
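
# ==========================================
# 6. EXAMPLE CLIENT CALL (REFERENCE ONLY)
# ==========================================
# A minimal sketch of how a client could call /api/voice-detection. This helper
# is illustrative and is never invoked by the server itself. It assumes the
# optional `requests` package is installed, that the server is reachable at
# http://localhost:7860, and that `api_key` matches the server's API_KEY
# environment variable; the function name and defaults are not part of the API.
def example_client_call(audio_path, api_key, url="http://localhost:7860/api/voice-detection"):
    import requests  # optional dependency, only needed for this example

    # Encode the raw audio bytes as Base64, matching AudioRequest.audioBase64
    with open(audio_path, "rb") as f:
        audio_b64 = base64.b64encode(f.read()).decode("utf-8")

    # The key is passed via the x-api-key header checked in detect_voice()
    resp = requests.post(
        url,
        json={"audioBase64": audio_b64},
        headers={"x-api-key": api_key},
        timeout=60,
    )
    # On success the body looks like:
    # {"status": "success", "classification": "...", "confidenceScore": 0.1234}
    return resp.json()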