Commit ·
3bf7b2c
1
Parent(s): 1caea61
style: format code
Browse files- .env.dist +7 -11
- .gitignore +2 -0
- notebooks/test_agent.ipynb +25 -43
- pyproject.toml +2 -1
- requirements.txt +1 -0
- src/paper_survey_agent/agent.py +2 -4
- src/paper_survey_agent/apis/__init__.py +1 -0
- src/paper_survey_agent/apis/arxiv.py +1 -6
- src/paper_survey_agent/apis/semantic_scholar.py +2 -5
- src/paper_survey_agent/llm/client.py +1 -4
- src/paper_survey_agent/models/__init__.py +1 -0
- src/paper_survey_agent/settings.py +0 -1
- src/paper_survey_agent/tools/generate_search_query/generate_search_query.py +1 -4
- src/paper_survey_agent/tools/search_and_load_papers_txt/search_and_load_papers_txt.py +3 -5
- src/paper_survey_agent/tools/search_and_load_papers_txt/utils/downloader.py +1 -4
- src/paper_survey_agent/tools/search_and_load_papers_txt/utils/ranking.py +3 -8
- src/paper_survey_agent/tools/search_and_load_papers_txt/utils/retrieval.py +5 -9
- src/paper_survey_agent/tools/search_and_load_papers_txt/utils/text_extractor.py +1 -4
- src/paper_survey_agent/tools/summarize_papers/summarize_papers.py +1 -4
- src/paper_survey_agent/tools/synthesize_survey/synthesize_survey.py +2 -4
- uv.lock +0 -0
.env.dist
CHANGED
|
@@ -1,18 +1,14 @@
|
|
| 1 |
-
#
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
MAX_TOKENS=4000
|
| 8 |
|
| 9 |
# Search Configuration
|
| 10 |
-
|
| 11 |
MAX_RESULTS_PER_SOURCE=20
|
| 12 |
SEARCH_TIMEOUT=300
|
| 13 |
|
| 14 |
# Semantic Scholar API (optional, but recommended for higher rate limits)
|
| 15 |
SEMANTIC_SCHOLAR_API_KEY=your-semantic-scholar-api-key-here
|
| 16 |
-
|
| 17 |
-
# Logging
|
| 18 |
-
LOG_LEVEL=INFO
|
|
|
|
| 1 |
+
# LLM Configuration
|
| 2 |
+
LLM_PROVIDER=openrouter
|
| 3 |
+
LLM_API_KEY=
|
| 4 |
+
LLM_MODEL=google/gemini-2.0-flash-exp:free
|
| 5 |
+
LLM_TEMPERATURE=0.7
|
| 6 |
+
LLM_MAX_TOKENS=4000
|
|
|
|
| 7 |
|
| 8 |
# Search Configuration
|
| 9 |
+
MAX_PAPERS_TO_RETURN=10
|
| 10 |
MAX_RESULTS_PER_SOURCE=20
|
| 11 |
SEARCH_TIMEOUT=300
|
| 12 |
|
| 13 |
# Semantic Scholar API (optional, but recommended for higher rate limits)
|
| 14 |
SEMANTIC_SCHOLAR_API_KEY=your-semantic-scholar-api-key-here
|
|
|
|
|
|
|
|
|
.gitignore
CHANGED
|
@@ -31,6 +31,8 @@ MANIFEST
|
|
| 31 |
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
*.manifest
|
| 33 |
*.spec
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# Installer logs
|
| 36 |
pip-log.txt
|
|
|
|
| 31 |
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
*.manifest
|
| 33 |
*.spec
|
| 34 |
+
.env*
|
| 35 |
+
!.env.dist
|
| 36 |
|
| 37 |
# Installer logs
|
| 38 |
pip-log.txt
|
notebooks/test_agent.ipynb
CHANGED
|
@@ -53,14 +53,16 @@
|
|
| 53 |
],
|
| 54 |
"source": [
|
| 55 |
"import asyncio\n",
|
| 56 |
-
"import sys\n",
|
| 57 |
"from pathlib import Path\n",
|
|
|
|
|
|
|
| 58 |
"\n",
|
| 59 |
"# Додаємо src до path\n",
|
| 60 |
-
"sys.path.insert(0, str(Path.cwd().parent /
|
| 61 |
"\n",
|
| 62 |
-
"from paper_survey_agent.tools import retrieve_papers, rank_and_deduplicate\n",
|
| 63 |
"from paper_survey_agent.models.paper import Paper\n",
|
|
|
|
|
|
|
| 64 |
"\n",
|
| 65 |
"print(\"✅ Модулі успішно імпортовано!\")"
|
| 66 |
]
|
|
@@ -99,11 +101,7 @@
|
|
| 99 |
"query = \"transformer models\"\n",
|
| 100 |
"print(f\"🔍 Шукаємо статті за запитом: '{query}'...\\n\")\n",
|
| 101 |
"\n",
|
| 102 |
-
"papers = await retrieve_papers(\n",
|
| 103 |
-
" query=query,\n",
|
| 104 |
-
" sources=[\"arxiv\", \"semantic_scholar\"],\n",
|
| 105 |
-
" max_results_per_source=10\n",
|
| 106 |
-
")\n",
|
| 107 |
"\n",
|
| 108 |
"print(f\"\\n✅ Знайдено {len(papers)} статей\")"
|
| 109 |
]
|
|
@@ -209,12 +207,7 @@
|
|
| 209 |
"source": [
|
| 210 |
"print(f\"🔄 Ранжуємо {len(papers)} статей...\\n\")\n",
|
| 211 |
"\n",
|
| 212 |
-
"ranked_papers = rank_and_deduplicate(\n",
|
| 213 |
-
" papers=papers,\n",
|
| 214 |
-
" topic=query,\n",
|
| 215 |
-
" top_k=10,\n",
|
| 216 |
-
" fuzzy_threshold=85\n",
|
| 217 |
-
")\n",
|
| 218 |
"\n",
|
| 219 |
"print(f\"\\n✅ Топ-{len(ranked_papers)} найрелевантніших статей (після дедуплікації)\")"
|
| 220 |
]
|
|
@@ -648,22 +641,14 @@
|
|
| 648 |
"query2 = \"attention mechanisms in neural networks\"\n",
|
| 649 |
"print(f\"🔍 Шукаємо статті за запитом: '{query2}'...\\n\")\n",
|
| 650 |
"\n",
|
| 651 |
-
"papers2 = await retrieve_papers(\n",
|
| 652 |
-
" query=query2,\n",
|
| 653 |
-
" sources=[\"arxiv\", \"semantic_scholar\"],\n",
|
| 654 |
-
" max_results_per_source=8\n",
|
| 655 |
-
")\n",
|
| 656 |
"\n",
|
| 657 |
"print(f\"\\n✅ Знайдено {len(papers2)} статей\")\n",
|
| 658 |
"\n",
|
| 659 |
"# Ранжування\n",
|
| 660 |
-
"ranked_papers2 = rank_and_deduplicate(\n",
|
| 661 |
-
" papers=papers2,\n",
|
| 662 |
-
" topic=query2,\n",
|
| 663 |
-
" top_k=5\n",
|
| 664 |
-
")\n",
|
| 665 |
"\n",
|
| 666 |
-
"print(
|
| 667 |
"for i, paper in enumerate(ranked_papers2, 1):\n",
|
| 668 |
" print(f\"{i}. {paper.title}\")\n",
|
| 669 |
" print(f\" 📅 {paper.published_date} | 📖 {paper.citations_count or 'N/A'} цитувань\")\n",
|
|
@@ -717,6 +702,7 @@
|
|
| 717 |
"from collections import Counter\n",
|
| 718 |
"from datetime import datetime\n",
|
| 719 |
"\n",
|
|
|
|
| 720 |
"print(\"📊 Статистика знайдених статей:\\n\")\n",
|
| 721 |
"print(f\"Всього знайдено: {len(papers)}\")\n",
|
| 722 |
"print(f\"Після дедуплікації: {len(ranked_papers)}\")\n",
|
|
@@ -724,14 +710,14 @@
|
|
| 724 |
"\n",
|
| 725 |
"# Статистика по джерелам\n",
|
| 726 |
"sources = Counter(p.source for p in papers)\n",
|
| 727 |
-
"print(
|
| 728 |
"for source, count in sources.items():\n",
|
| 729 |
" print(f\" - {source}: {count}\")\n",
|
| 730 |
"\n",
|
| 731 |
"# Статистика по рокам\n",
|
| 732 |
"years = [p.published_date.year for p in ranked_papers]\n",
|
| 733 |
"year_counts = Counter(years)\n",
|
| 734 |
-
"print(
|
| 735 |
"for year, count in sorted(year_counts.items(), reverse=True):\n",
|
| 736 |
" print(f\" - {year}: {count}\")\n",
|
| 737 |
"\n",
|
|
@@ -745,7 +731,7 @@
|
|
| 745 |
"\n",
|
| 746 |
"# Наявність PDF\n",
|
| 747 |
"with_pdf = sum(1 for p in ranked_papers if p.pdf_url)\n",
|
| 748 |
-
"print(f\"\\nСтатей з PDF: {with_pdf}/{len(ranked_papers)} ({with_pdf/len(ranked_papers)*100:.1f}%)\")"
|
| 749 |
]
|
| 750 |
},
|
| 751 |
{
|
|
@@ -885,39 +871,35 @@
|
|
| 885 |
"test_queries = [\n",
|
| 886 |
" (\"machine learning\", \"Популярна тема (очікуємо 10 статей)\"),\n",
|
| 887 |
" (\"lemon juice\", \"Рідкісна тема (можливо менше 10)\"),\n",
|
| 888 |
-
" (\"zzxxyywwqqppvvkkjjhhggffddssaammnnbbccll123456789\", \"Абсолютно випадковий набір (очікуємо 0 → ValueError)\")
|
| 889 |
"]\n",
|
| 890 |
"\n",
|
| 891 |
"for query, description in test_queries:\n",
|
| 892 |
" print(f\"\\n📝 Запит: '{query}'\")\n",
|
| 893 |
" print(f\" Опис: {description}\")\n",
|
| 894 |
" print(\"-\" * 80)\n",
|
| 895 |
-
"
|
| 896 |
" try:\n",
|
| 897 |
-
" papers_test = await retrieve_papers(\n",
|
| 898 |
-
"
|
| 899 |
-
" sources=[\"arxiv\"],\n",
|
| 900 |
-
" max_results_per_source=10\n",
|
| 901 |
-
" )\n",
|
| 902 |
-
" \n",
|
| 903 |
" found = len(papers_test)\n",
|
| 904 |
-
"
|
| 905 |
" if found == 10:\n",
|
| 906 |
" print(f\" ✅ Знайдено {found} статей з PDF — рівно стільки скільки просили!\")\n",
|
| 907 |
" elif found > 0:\n",
|
| 908 |
" print(f\" ⚠️ Знайдено {found} статей з PDF (менше ніж 10)\")\n",
|
| 909 |
-
" print(
|
| 910 |
-
"
|
| 911 |
" # Показуємо перші 2 статті якщо є\n",
|
| 912 |
" if found > 0:\n",
|
| 913 |
-
" print(
|
| 914 |
" for i, paper in enumerate(papers_test[:2], 1):\n",
|
| 915 |
" print(f\" {i}. {paper.title[:70]}...\")\n",
|
| 916 |
-
"
|
| 917 |
" except ValueError as e:\n",
|
| 918 |
" print(f\" ❌ ValueError: {e}\")\n",
|
| 919 |
-
" print(
|
| 920 |
-
"
|
| 921 |
" print()"
|
| 922 |
]
|
| 923 |
}
|
|
|
|
| 53 |
],
|
| 54 |
"source": [
|
| 55 |
"import asyncio\n",
|
|
|
|
| 56 |
"from pathlib import Path\n",
|
| 57 |
+
"import sys\n",
|
| 58 |
+
"\n",
|
| 59 |
"\n",
|
| 60 |
"# Додаємо src до path\n",
|
| 61 |
+
"sys.path.insert(0, str(Path.cwd().parent / \"src\"))\n",
|
| 62 |
"\n",
|
|
|
|
| 63 |
"from paper_survey_agent.models.paper import Paper\n",
|
| 64 |
+
"from paper_survey_agent.tools import rank_and_deduplicate, retrieve_papers\n",
|
| 65 |
+
"\n",
|
| 66 |
"\n",
|
| 67 |
"print(\"✅ Модулі успішно імпортовано!\")"
|
| 68 |
]
|
|
|
|
| 101 |
"query = \"transformer models\"\n",
|
| 102 |
"print(f\"🔍 Шукаємо статті за запитом: '{query}'...\\n\")\n",
|
| 103 |
"\n",
|
| 104 |
+
"papers = await retrieve_papers(query=query, sources=[\"arxiv\", \"semantic_scholar\"], max_results_per_source=10)\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
"\n",
|
| 106 |
"print(f\"\\n✅ Знайдено {len(papers)} статей\")"
|
| 107 |
]
|
|
|
|
| 207 |
"source": [
|
| 208 |
"print(f\"🔄 Ранжуємо {len(papers)} статей...\\n\")\n",
|
| 209 |
"\n",
|
| 210 |
+
"ranked_papers = rank_and_deduplicate(papers=papers, topic=query, top_k=10, fuzzy_threshold=85)\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
"\n",
|
| 212 |
"print(f\"\\n✅ Топ-{len(ranked_papers)} найрелевантніших статей (після дедуплікації)\")"
|
| 213 |
]
|
|
|
|
| 641 |
"query2 = \"attention mechanisms in neural networks\"\n",
|
| 642 |
"print(f\"🔍 Шукаємо статті за запитом: '{query2}'...\\n\")\n",
|
| 643 |
"\n",
|
| 644 |
+
"papers2 = await retrieve_papers(query=query2, sources=[\"arxiv\", \"semantic_scholar\"], max_results_per_source=8)\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
"\n",
|
| 646 |
"print(f\"\\n✅ Знайдено {len(papers2)} статей\")\n",
|
| 647 |
"\n",
|
| 648 |
"# Ранжування\n",
|
| 649 |
+
"ranked_papers2 = rank_and_deduplicate(papers=papers2, topic=query2, top_k=5)\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 650 |
"\n",
|
| 651 |
+
"print(\"\\n🏆 Топ-5 після ранжування:\\n\")\n",
|
| 652 |
"for i, paper in enumerate(ranked_papers2, 1):\n",
|
| 653 |
" print(f\"{i}. {paper.title}\")\n",
|
| 654 |
" print(f\" 📅 {paper.published_date} | 📖 {paper.citations_count or 'N/A'} цитувань\")\n",
|
|
|
|
| 702 |
"from collections import Counter\n",
|
| 703 |
"from datetime import datetime\n",
|
| 704 |
"\n",
|
| 705 |
+
"\n",
|
| 706 |
"print(\"📊 Статистика знайдених статей:\\n\")\n",
|
| 707 |
"print(f\"Всього знайдено: {len(papers)}\")\n",
|
| 708 |
"print(f\"Після дедуплікації: {len(ranked_papers)}\")\n",
|
|
|
|
| 710 |
"\n",
|
| 711 |
"# Статистика по джерелам\n",
|
| 712 |
"sources = Counter(p.source for p in papers)\n",
|
| 713 |
+
"print(\"\\nПо джерелам:\")\n",
|
| 714 |
"for source, count in sources.items():\n",
|
| 715 |
" print(f\" - {source}: {count}\")\n",
|
| 716 |
"\n",
|
| 717 |
"# Статистика по рокам\n",
|
| 718 |
"years = [p.published_date.year for p in ranked_papers]\n",
|
| 719 |
"year_counts = Counter(years)\n",
|
| 720 |
+
"print(\"\\nПо рокам публікації (топ-10):\")\n",
|
| 721 |
"for year, count in sorted(year_counts.items(), reverse=True):\n",
|
| 722 |
" print(f\" - {year}: {count}\")\n",
|
| 723 |
"\n",
|
|
|
|
| 731 |
"\n",
|
| 732 |
"# Наявність PDF\n",
|
| 733 |
"with_pdf = sum(1 for p in ranked_papers if p.pdf_url)\n",
|
| 734 |
+
"print(f\"\\nСтатей з PDF: {with_pdf}/{len(ranked_papers)} ({with_pdf / len(ranked_papers) * 100:.1f}%)\")"
|
| 735 |
]
|
| 736 |
},
|
| 737 |
{
|
|
|
|
| 871 |
"test_queries = [\n",
|
| 872 |
" (\"machine learning\", \"Популярна тема (очікуємо 10 статей)\"),\n",
|
| 873 |
" (\"lemon juice\", \"Рідкісна тема (можливо менше 10)\"),\n",
|
| 874 |
+
" (\"zzxxyywwqqppvvkkjjhhggffddssaammnnbbccll123456789\", \"Абсолютно випадковий набір (очікуємо 0 → ValueError)\"),\n",
|
| 875 |
"]\n",
|
| 876 |
"\n",
|
| 877 |
"for query, description in test_queries:\n",
|
| 878 |
" print(f\"\\n📝 Запит: '{query}'\")\n",
|
| 879 |
" print(f\" Опис: {description}\")\n",
|
| 880 |
" print(\"-\" * 80)\n",
|
| 881 |
+
"\n",
|
| 882 |
" try:\n",
|
| 883 |
+
" papers_test = await retrieve_papers(query=query, sources=[\"arxiv\"], max_results_per_source=10)\n",
|
| 884 |
+
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 885 |
" found = len(papers_test)\n",
|
| 886 |
+
"\n",
|
| 887 |
" if found == 10:\n",
|
| 888 |
" print(f\" ✅ Знайдено {found} статей з PDF — рівно стільки скільки просили!\")\n",
|
| 889 |
" elif found > 0:\n",
|
| 890 |
" print(f\" ⚠️ Знайдено {found} статей з PDF (менше ніж 10)\")\n",
|
| 891 |
+
" print(\" 💡 Система повернула все що знайшла, не викинула помилку\")\n",
|
| 892 |
+
"\n",
|
| 893 |
" # Показуємо перші 2 статті якщо є\n",
|
| 894 |
" if found > 0:\n",
|
| 895 |
+
" print(\"\\n 📚 Приклади знайденого:\")\n",
|
| 896 |
" for i, paper in enumerate(papers_test[:2], 1):\n",
|
| 897 |
" print(f\" {i}. {paper.title[:70]}...\")\n",
|
| 898 |
+
"\n",
|
| 899 |
" except ValueError as e:\n",
|
| 900 |
" print(f\" ❌ ValueError: {e}\")\n",
|
| 901 |
+
" print(\" 💡 Це означає що знайдено 0 статей з PDF — критична помилка!\")\n",
|
| 902 |
+
"\n",
|
| 903 |
" print()"
|
| 904 |
]
|
| 905 |
}
|
pyproject.toml
CHANGED
|
@@ -13,11 +13,12 @@ dependencies = [
|
|
| 13 |
"pydantic-settings>=2.0.0",
|
| 14 |
"python-dotenv>=1.0.0",
|
| 15 |
"httpx>=0.25.0",
|
| 16 |
-
"gradio
|
| 17 |
"rapidfuzz>=3.0.0",
|
| 18 |
"tenacity>=8.0.0",
|
| 19 |
"pymupdf>=1.26.7",
|
| 20 |
"litellm>=1.80.10",
|
|
|
|
| 21 |
]
|
| 22 |
|
| 23 |
[dependency-groups]
|
|
|
|
| 13 |
"pydantic-settings>=2.0.0",
|
| 14 |
"python-dotenv>=1.0.0",
|
| 15 |
"httpx>=0.25.0",
|
| 16 |
+
"gradio==6.1.0",
|
| 17 |
"rapidfuzz>=3.0.0",
|
| 18 |
"tenacity>=8.0.0",
|
| 19 |
"pymupdf>=1.26.7",
|
| 20 |
"litellm>=1.80.10",
|
| 21 |
+
"loguru>=0.7.3",
|
| 22 |
]
|
| 23 |
|
| 24 |
[dependency-groups]
|
requirements.txt
CHANGED
|
@@ -7,3 +7,4 @@ python-dotenv>=1.0.0
|
|
| 7 |
httpx>=0.25.0
|
| 8 |
rapidfuzz>=3.0.0
|
| 9 |
tenacity>=8.0.0
|
|
|
|
|
|
| 7 |
httpx>=0.25.0
|
| 8 |
rapidfuzz>=3.0.0
|
| 9 |
tenacity>=8.0.0
|
| 10 |
+
loguru>=0.7.0
|
src/paper_survey_agent/agent.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
import asyncio
|
| 2 |
-
import logging
|
| 3 |
from typing import Optional
|
| 4 |
|
|
|
|
|
|
|
| 5 |
from paper_survey_agent.models.paper import SummarizedPaper
|
| 6 |
from paper_survey_agent.tools import (
|
| 7 |
generate_search_query,
|
|
@@ -11,9 +12,6 @@ from paper_survey_agent.tools import (
|
|
| 11 |
)
|
| 12 |
|
| 13 |
|
| 14 |
-
logger = logging.getLogger(__name__)
|
| 15 |
-
|
| 16 |
-
|
| 17 |
class PaperSurveyAgent:
|
| 18 |
async def run(self, topic: str) -> tuple[list[SummarizedPaper], str] | None:
|
| 19 |
logger.info(f" Agent started for topic: '{topic}'")
|
|
|
|
| 1 |
import asyncio
|
|
|
|
| 2 |
from typing import Optional
|
| 3 |
|
| 4 |
+
from loguru import logger
|
| 5 |
+
|
| 6 |
from paper_survey_agent.models.paper import SummarizedPaper
|
| 7 |
from paper_survey_agent.tools import (
|
| 8 |
generate_search_query,
|
|
|
|
| 12 |
)
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
class PaperSurveyAgent:
|
| 16 |
async def run(self, topic: str) -> tuple[list[SummarizedPaper], str] | None:
|
| 17 |
logger.info(f" Agent started for topic: '{topic}'")
|
src/paper_survey_agent/apis/__init__.py
CHANGED
|
@@ -4,4 +4,5 @@ from paper_survey_agent.apis.arxiv import ArxivAPI
|
|
| 4 |
from paper_survey_agent.apis.base import BaseScientificAPI
|
| 5 |
from paper_survey_agent.apis.semantic_scholar import SemanticScholarAPI
|
| 6 |
|
|
|
|
| 7 |
__all__ = ["ArxivAPI", "SemanticScholarAPI", "BaseScientificAPI"]
|
|
|
|
| 4 |
from paper_survey_agent.apis.base import BaseScientificAPI
|
| 5 |
from paper_survey_agent.apis.semantic_scholar import SemanticScholarAPI
|
| 6 |
|
| 7 |
+
|
| 8 |
__all__ = ["ArxivAPI", "SemanticScholarAPI", "BaseScientificAPI"]
|
src/paper_survey_agent/apis/arxiv.py
CHANGED
|
@@ -1,10 +1,8 @@
|
|
| 1 |
-
"""arXiv API client for retrieving scientific papers."""
|
| 2 |
-
|
| 3 |
from datetime import datetime
|
| 4 |
-
import logging
|
| 5 |
from typing import Optional
|
| 6 |
|
| 7 |
import arxiv
|
|
|
|
| 8 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 9 |
|
| 10 |
from paper_survey_agent.apis.base import BaseScientificAPI
|
|
@@ -12,9 +10,6 @@ from paper_survey_agent.models.paper import Paper
|
|
| 12 |
from paper_survey_agent.settings import settings
|
| 13 |
|
| 14 |
|
| 15 |
-
logger = logging.getLogger(__name__)
|
| 16 |
-
|
| 17 |
-
|
| 18 |
class ArxivAPI(BaseScientificAPI):
|
| 19 |
def __init__(self, page_size: int = settings.ARXIV_PAGE_SIZE, delay_seconds: int = settings.ARXIV_DELAY_SECONDS):
|
| 20 |
self.page_size = page_size
|
|
|
|
|
|
|
|
|
|
| 1 |
from datetime import datetime
|
|
|
|
| 2 |
from typing import Optional
|
| 3 |
|
| 4 |
import arxiv
|
| 5 |
+
from loguru import logger
|
| 6 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 7 |
|
| 8 |
from paper_survey_agent.apis.base import BaseScientificAPI
|
|
|
|
| 10 |
from paper_survey_agent.settings import settings
|
| 11 |
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
class ArxivAPI(BaseScientificAPI):
|
| 14 |
def __init__(self, page_size: int = settings.ARXIV_PAGE_SIZE, delay_seconds: int = settings.ARXIV_DELAY_SECONDS):
|
| 15 |
self.page_size = page_size
|
src/paper_survey_agent/apis/semantic_scholar.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import asyncio
|
| 2 |
from datetime import datetime
|
| 3 |
-
import logging
|
| 4 |
from typing import Optional
|
| 5 |
|
| 6 |
import httpx
|
|
|
|
| 7 |
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
|
| 8 |
|
| 9 |
from paper_survey_agent.apis.base import BaseScientificAPI
|
|
@@ -11,9 +11,6 @@ from paper_survey_agent.models.paper import Paper
|
|
| 11 |
from paper_survey_agent.settings import settings
|
| 12 |
|
| 13 |
|
| 14 |
-
logger = logging.getLogger(__name__)
|
| 15 |
-
|
| 16 |
-
|
| 17 |
class SemanticScholarAPI(BaseScientificAPI):
|
| 18 |
BASE_URL = settings.SEMANTIC_SCHOLAR_API_BASE_URL
|
| 19 |
|
|
@@ -48,7 +45,7 @@ class SemanticScholarAPI(BaseScientificAPI):
|
|
| 48 |
timeout=timeout,
|
| 49 |
)
|
| 50 |
|
| 51 |
-
logger.info(f"Initialized SemanticScholarAPI
|
| 52 |
|
| 53 |
async def __aenter__(self):
|
| 54 |
return self
|
|
|
|
| 1 |
import asyncio
|
| 2 |
from datetime import datetime
|
|
|
|
| 3 |
from typing import Optional
|
| 4 |
|
| 5 |
import httpx
|
| 6 |
+
from loguru import logger
|
| 7 |
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
|
| 8 |
|
| 9 |
from paper_survey_agent.apis.base import BaseScientificAPI
|
|
|
|
| 11 |
from paper_survey_agent.settings import settings
|
| 12 |
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
class SemanticScholarAPI(BaseScientificAPI):
|
| 15 |
BASE_URL = settings.SEMANTIC_SCHOLAR_API_BASE_URL
|
| 16 |
|
|
|
|
| 45 |
timeout=timeout,
|
| 46 |
)
|
| 47 |
|
| 48 |
+
logger.info(f"Initialized SemanticScholarAPI (authenticated: {bool(api_key)}, timeout: {timeout}s)")
|
| 49 |
|
| 50 |
async def __aenter__(self):
|
| 51 |
return self
|
src/paper_survey_agent/llm/client.py
CHANGED
|
@@ -1,16 +1,13 @@
|
|
| 1 |
-
import logging
|
| 2 |
import os
|
| 3 |
from typing import Any
|
| 4 |
|
| 5 |
from litellm import completion
|
|
|
|
| 6 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 7 |
|
| 8 |
from paper_survey_agent.settings import settings
|
| 9 |
|
| 10 |
|
| 11 |
-
logger = logging.getLogger(__name__)
|
| 12 |
-
|
| 13 |
-
|
| 14 |
class LLMClient:
|
| 15 |
def __init__(self):
|
| 16 |
self.model = settings.LLM_MODEL
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
from typing import Any
|
| 3 |
|
| 4 |
from litellm import completion
|
| 5 |
+
from loguru import logger
|
| 6 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 7 |
|
| 8 |
from paper_survey_agent.settings import settings
|
| 9 |
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
class LLMClient:
|
| 12 |
def __init__(self):
|
| 13 |
self.model = settings.LLM_MODEL
|
src/paper_survey_agent/models/__init__.py
CHANGED
|
@@ -2,4 +2,5 @@
|
|
| 2 |
|
| 3 |
from .paper import Paper
|
| 4 |
|
|
|
|
| 5 |
__all__ = ["Paper"]
|
|
|
|
| 2 |
|
| 3 |
from .paper import Paper
|
| 4 |
|
| 5 |
+
|
| 6 |
__all__ = ["Paper"]
|
src/paper_survey_agent/settings.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
import os
|
| 2 |
from pathlib import Path
|
| 3 |
|
| 4 |
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
|
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
|
| 3 |
from pydantic_settings import BaseSettings, SettingsConfigDict
|
src/paper_survey_agent/tools/generate_search_query/generate_search_query.py
CHANGED
|
@@ -1,12 +1,9 @@
|
|
| 1 |
-
import logging
|
| 2 |
|
| 3 |
from paper_survey_agent.llm.client import llm_client
|
| 4 |
from paper_survey_agent.llm.prompts import SEARCH_QUERY_REFINEMENT_SYSTEM_PROMPT
|
| 5 |
|
| 6 |
|
| 7 |
-
logger = logging.getLogger(__name__)
|
| 8 |
-
|
| 9 |
-
|
| 10 |
def generate_search_query(user_query: str) -> str:
|
| 11 |
logger.info(f"🧠 Refining user query: '{user_query}'")
|
| 12 |
|
|
|
|
| 1 |
+
from loguru import logger
|
| 2 |
|
| 3 |
from paper_survey_agent.llm.client import llm_client
|
| 4 |
from paper_survey_agent.llm.prompts import SEARCH_QUERY_REFINEMENT_SYSTEM_PROMPT
|
| 5 |
|
| 6 |
|
|
|
|
|
|
|
|
|
|
| 7 |
def generate_search_query(user_query: str) -> str:
|
| 8 |
logger.info(f"🧠 Refining user query: '{user_query}'")
|
| 9 |
|
src/paper_survey_agent/tools/search_and_load_papers_txt/search_and_load_papers_txt.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
import asyncio
|
| 2 |
-
import logging
|
| 3 |
import math
|
| 4 |
|
|
|
|
|
|
|
| 5 |
from paper_survey_agent.models.paper import ProcessedPaper
|
| 6 |
from paper_survey_agent.settings import settings
|
| 7 |
from paper_survey_agent.tools.search_and_load_papers_txt.utils.downloader import download_papers
|
|
@@ -11,9 +12,6 @@ from paper_survey_agent.tools.search_and_load_papers_txt.utils.retrieval import
|
|
| 11 |
from paper_survey_agent.tools.search_and_load_papers_txt.utils.text_extractor import convert_pdfs_to_text
|
| 12 |
|
| 13 |
|
| 14 |
-
logger = logging.getLogger(__name__)
|
| 15 |
-
|
| 16 |
-
|
| 17 |
async def search_and_load_papers_txt(query: str) -> list[ProcessedPaper]:
|
| 18 |
clear_data_directory()
|
| 19 |
|
|
@@ -45,7 +43,7 @@ async def search_and_load_papers_txt(query: str) -> list[ProcessedPaper]:
|
|
| 45 |
current_index = batch_end
|
| 46 |
|
| 47 |
if len(downloaded_pdfs) < target_count:
|
| 48 |
-
logger.warning(f"Pipeline finished with {len(downloaded_pdfs)} papers,
|
| 49 |
|
| 50 |
txt_paths = await convert_pdfs_to_text(downloaded_pdfs)
|
| 51 |
|
|
|
|
| 1 |
import asyncio
|
|
|
|
| 2 |
import math
|
| 3 |
|
| 4 |
+
from loguru import logger
|
| 5 |
+
|
| 6 |
from paper_survey_agent.models.paper import ProcessedPaper
|
| 7 |
from paper_survey_agent.settings import settings
|
| 8 |
from paper_survey_agent.tools.search_and_load_papers_txt.utils.downloader import download_papers
|
|
|
|
| 12 |
from paper_survey_agent.tools.search_and_load_papers_txt.utils.text_extractor import convert_pdfs_to_text
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
async def search_and_load_papers_txt(query: str) -> list[ProcessedPaper]:
|
| 16 |
clear_data_directory()
|
| 17 |
|
|
|
|
| 43 |
current_index = batch_end
|
| 44 |
|
| 45 |
if len(downloaded_pdfs) < target_count:
|
| 46 |
+
logger.warning(f"Pipeline finished with {len(downloaded_pdfs)} papers, short of target {target_count}.")
|
| 47 |
|
| 48 |
txt_paths = await convert_pdfs_to_text(downloaded_pdfs)
|
| 49 |
|
src/paper_survey_agent/tools/search_and_load_papers_txt/utils/downloader.py
CHANGED
|
@@ -1,19 +1,16 @@
|
|
| 1 |
import asyncio
|
| 2 |
-
import logging
|
| 3 |
from pathlib import Path
|
| 4 |
import re
|
| 5 |
from typing import Optional
|
| 6 |
|
| 7 |
import httpx
|
|
|
|
| 8 |
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
|
| 9 |
|
| 10 |
from paper_survey_agent.models.paper import Paper
|
| 11 |
from paper_survey_agent.settings import settings
|
| 12 |
|
| 13 |
|
| 14 |
-
logger = logging.getLogger(__name__)
|
| 15 |
-
|
| 16 |
-
|
| 17 |
async def download_papers(papers: list[Paper], destination_dir: str | Path | None = None) -> dict[str, Path]:
|
| 18 |
if destination_dir is None:
|
| 19 |
destination_dir = Path(settings.DATA_DIR) / "pdfs"
|
|
|
|
| 1 |
import asyncio
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
import re
|
| 4 |
from typing import Optional
|
| 5 |
|
| 6 |
import httpx
|
| 7 |
+
from loguru import logger
|
| 8 |
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
|
| 9 |
|
| 10 |
from paper_survey_agent.models.paper import Paper
|
| 11 |
from paper_survey_agent.settings import settings
|
| 12 |
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
async def download_papers(papers: list[Paper], destination_dir: str | Path | None = None) -> dict[str, Path]:
|
| 15 |
if destination_dir is None:
|
| 16 |
destination_dir = Path(settings.DATA_DIR) / "pdfs"
|
src/paper_survey_agent/tools/search_and_load_papers_txt/utils/ranking.py
CHANGED
|
@@ -1,18 +1,15 @@
|
|
| 1 |
from collections import Counter
|
| 2 |
from datetime import datetime
|
| 3 |
-
import logging
|
| 4 |
import re
|
| 5 |
from typing import Optional
|
| 6 |
|
|
|
|
| 7 |
from rapidfuzz import fuzz
|
| 8 |
|
| 9 |
from paper_survey_agent.models.paper import Paper
|
| 10 |
from paper_survey_agent.settings import settings
|
| 11 |
|
| 12 |
|
| 13 |
-
logger = logging.getLogger(__name__)
|
| 14 |
-
|
| 15 |
-
|
| 16 |
def rank_and_deduplicate(
|
| 17 |
papers: list[Paper],
|
| 18 |
topic: str,
|
|
@@ -42,7 +39,7 @@ def rank_and_deduplicate(
|
|
| 42 |
if scored_papers:
|
| 43 |
logger.info(
|
| 44 |
f"Returning top {len(top_papers)} papers. "
|
| 45 |
-
f"Score range: {scored_papers[0][1]:.3f} - {scored_papers[min(top_k-1, len(scored_papers)-1)][1]:.3f}"
|
| 46 |
)
|
| 47 |
|
| 48 |
return top_papers
|
|
@@ -64,9 +61,7 @@ def _deduplicate_papers(papers: list[Paper], fuzzy_threshold: int) -> list[Paper
|
|
| 64 |
for seen_title, seen_paper in seen_titles:
|
| 65 |
similarity = fuzz.ratio(normalized_title, seen_title)
|
| 66 |
if similarity >= fuzzy_threshold:
|
| 67 |
-
logger.debug(
|
| 68 |
-
f"Fuzzy duplicate found ({similarity}% similar): " f"'{paper.title}' ≈ '{seen_paper.title}'"
|
| 69 |
-
)
|
| 70 |
if paper.citations_count and not seen_paper.citations_count:
|
| 71 |
deduplicated.remove(seen_paper)
|
| 72 |
seen_ids.remove(seen_paper.id)
|
|
|
|
| 1 |
from collections import Counter
|
| 2 |
from datetime import datetime
|
|
|
|
| 3 |
import re
|
| 4 |
from typing import Optional
|
| 5 |
|
| 6 |
+
from loguru import logger
|
| 7 |
from rapidfuzz import fuzz
|
| 8 |
|
| 9 |
from paper_survey_agent.models.paper import Paper
|
| 10 |
from paper_survey_agent.settings import settings
|
| 11 |
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
def rank_and_deduplicate(
|
| 14 |
papers: list[Paper],
|
| 15 |
topic: str,
|
|
|
|
| 39 |
if scored_papers:
|
| 40 |
logger.info(
|
| 41 |
f"Returning top {len(top_papers)} papers. "
|
| 42 |
+
f"Score range: {scored_papers[0][1]:.3f} - {scored_papers[min(top_k - 1, len(scored_papers) - 1)][1]:.3f}"
|
| 43 |
)
|
| 44 |
|
| 45 |
return top_papers
|
|
|
|
| 61 |
for seen_title, seen_paper in seen_titles:
|
| 62 |
similarity = fuzz.ratio(normalized_title, seen_title)
|
| 63 |
if similarity >= fuzzy_threshold:
|
| 64 |
+
logger.debug(f"Fuzzy duplicate found ({similarity}% similar): '{paper.title}' ≈ '{seen_paper.title}'")
|
|
|
|
|
|
|
| 65 |
if paper.citations_count and not seen_paper.citations_count:
|
| 66 |
deduplicated.remove(seen_paper)
|
| 67 |
seen_ids.remove(seen_paper.id)
|
src/paper_survey_agent/tools/search_and_load_papers_txt/utils/retrieval.py
CHANGED
|
@@ -1,16 +1,14 @@
|
|
| 1 |
import asyncio
|
| 2 |
-
import logging
|
| 3 |
import os
|
| 4 |
from typing import Optional
|
| 5 |
|
|
|
|
|
|
|
| 6 |
from paper_survey_agent.apis import ArxivAPI, SemanticScholarAPI
|
| 7 |
from paper_survey_agent.models.paper import Paper
|
| 8 |
from paper_survey_agent.settings import settings
|
| 9 |
|
| 10 |
|
| 11 |
-
logger = logging.getLogger(__name__)
|
| 12 |
-
|
| 13 |
-
|
| 14 |
async def retrieve_papers(
|
| 15 |
query: str,
|
| 16 |
sources: list[str] | None = None,
|
|
@@ -27,7 +25,7 @@ async def retrieve_papers(
|
|
| 27 |
if semantic_scholar_api_key:
|
| 28 |
logger.info("Using Semantic Scholar API key from environment")
|
| 29 |
|
| 30 |
-
logger.info(f"Retrieving papers: query='{query}', sources={sources},
|
| 31 |
|
| 32 |
tasks = []
|
| 33 |
|
|
@@ -66,9 +64,7 @@ async def retrieve_papers(
|
|
| 66 |
)
|
| 67 |
|
| 68 |
if not all_papers:
|
| 69 |
-
raise ValueError(
|
| 70 |
-
f"Failed to retrieve papers from all sources. " f"Attempted: {sources}, Failed: {failed_sources}"
|
| 71 |
-
)
|
| 72 |
|
| 73 |
return all_papers
|
| 74 |
|
|
@@ -183,7 +179,7 @@ async def _fetch_from_semantic_scholar(
|
|
| 183 |
)
|
| 184 |
else:
|
| 185 |
logger.info(
|
| 186 |
-
f"Semantic Scholar returned {len(papers_with_pdf)} papers with PDF
|
| 187 |
)
|
| 188 |
|
| 189 |
return papers_with_pdf
|
|
|
|
| 1 |
import asyncio
|
|
|
|
| 2 |
import os
|
| 3 |
from typing import Optional
|
| 4 |
|
| 5 |
+
from loguru import logger
|
| 6 |
+
|
| 7 |
from paper_survey_agent.apis import ArxivAPI, SemanticScholarAPI
|
| 8 |
from paper_survey_agent.models.paper import Paper
|
| 9 |
from paper_survey_agent.settings import settings
|
| 10 |
|
| 11 |
|
|
|
|
|
|
|
|
|
|
| 12 |
async def retrieve_papers(
|
| 13 |
query: str,
|
| 14 |
sources: list[str] | None = None,
|
|
|
|
| 25 |
if semantic_scholar_api_key:
|
| 26 |
logger.info("Using Semantic Scholar API key from environment")
|
| 27 |
|
| 28 |
+
logger.info(f"Retrieving papers: query='{query}', sources={sources}, max_per_source={max_results_per_source}")
|
| 29 |
|
| 30 |
tasks = []
|
| 31 |
|
|
|
|
| 64 |
)
|
| 65 |
|
| 66 |
if not all_papers:
|
| 67 |
+
raise ValueError(f"Failed to retrieve papers from all sources. Attempted: {sources}, Failed: {failed_sources}")
|
|
|
|
|
|
|
| 68 |
|
| 69 |
return all_papers
|
| 70 |
|
|
|
|
| 179 |
)
|
| 180 |
else:
|
| 181 |
logger.info(
|
| 182 |
+
f"Semantic Scholar returned {len(papers_with_pdf)} papers with PDF (target: {max_results})"
|
| 183 |
)
|
| 184 |
|
| 185 |
return papers_with_pdf
|
src/paper_survey_agent/tools/search_and_load_papers_txt/utils/text_extractor.py
CHANGED
|
@@ -1,16 +1,13 @@
|
|
| 1 |
import asyncio
|
| 2 |
-
import logging
|
| 3 |
from pathlib import Path
|
| 4 |
|
| 5 |
import aiofiles
|
|
|
|
| 6 |
import pymupdf
|
| 7 |
|
| 8 |
from paper_survey_agent.settings import settings
|
| 9 |
|
| 10 |
|
| 11 |
-
logger = logging.getLogger(__name__)
|
| 12 |
-
|
| 13 |
-
|
| 14 |
def extract_text_from_pdf_sync(pdf_path: Path) -> str:
|
| 15 |
try:
|
| 16 |
with pymupdf.open(pdf_path) as doc:
|
|
|
|
| 1 |
import asyncio
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
|
| 4 |
import aiofiles
|
| 5 |
+
from loguru import logger
|
| 6 |
import pymupdf
|
| 7 |
|
| 8 |
from paper_survey_agent.settings import settings
|
| 9 |
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
def extract_text_from_pdf_sync(pdf_path: Path) -> str:
|
| 12 |
try:
|
| 13 |
with pymupdf.open(pdf_path) as doc:
|
src/paper_survey_agent/tools/summarize_papers/summarize_papers.py
CHANGED
|
@@ -1,18 +1,15 @@
|
|
| 1 |
import asyncio
|
| 2 |
import json
|
| 3 |
-
import logging
|
| 4 |
from typing import Any
|
| 5 |
|
| 6 |
import aiofiles
|
|
|
|
| 7 |
|
| 8 |
from paper_survey_agent.llm.client import llm_client
|
| 9 |
from paper_survey_agent.llm.prompts import PAPER_SUMMARIZATION_SYSTEM_PROMPT
|
| 10 |
from paper_survey_agent.models.paper import ProcessedPaper, SummarizedPaper
|
| 11 |
|
| 12 |
|
| 13 |
-
logger = logging.getLogger(__name__)
|
| 14 |
-
|
| 15 |
-
|
| 16 |
def parse_llm_json(text: str) -> dict[str, Any]:
|
| 17 |
cleaned = text.strip()
|
| 18 |
|
|
|
|
| 1 |
import asyncio
|
| 2 |
import json
|
|
|
|
| 3 |
from typing import Any
|
| 4 |
|
| 5 |
import aiofiles
|
| 6 |
+
from loguru import logger
|
| 7 |
|
| 8 |
from paper_survey_agent.llm.client import llm_client
|
| 9 |
from paper_survey_agent.llm.prompts import PAPER_SUMMARIZATION_SYSTEM_PROMPT
|
| 10 |
from paper_survey_agent.models.paper import ProcessedPaper, SummarizedPaper
|
| 11 |
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
def parse_llm_json(text: str) -> dict[str, Any]:
|
| 14 |
cleaned = text.strip()
|
| 15 |
|
src/paper_survey_agent/tools/synthesize_survey/synthesize_survey.py
CHANGED
|
@@ -1,14 +1,12 @@
|
|
| 1 |
import asyncio
|
| 2 |
-
import logging
|
|
|
|
| 3 |
|
| 4 |
from paper_survey_agent.llm.client import llm_client
|
| 5 |
from paper_survey_agent.llm.prompts import SURVEY_SYNTHESIS_SYSTEM_PROMPT
|
| 6 |
from paper_survey_agent.models.paper import SummarizedPaper
|
| 7 |
|
| 8 |
|
| 9 |
-
logger = logging.getLogger(__name__)
|
| 10 |
-
|
| 11 |
-
|
| 12 |
def format_papers_for_synthesis(papers: list[SummarizedPaper]) -> str:
|
| 13 |
context_parts = []
|
| 14 |
|
|
|
|
| 1 |
import asyncio
|
| 2 |
+
|
| 3 |
+
from loguru import logger
|
| 4 |
|
| 5 |
from paper_survey_agent.llm.client import llm_client
|
| 6 |
from paper_survey_agent.llm.prompts import SURVEY_SYNTHESIS_SYSTEM_PROMPT
|
| 7 |
from paper_survey_agent.models.paper import SummarizedPaper
|
| 8 |
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
def format_papers_for_synthesis(papers: list[SummarizedPaper]) -> str:
|
| 11 |
context_parts = []
|
| 12 |
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|