DenysKovalML committed
Commit 3bf7b2c · 1 Parent(s): 1caea61

style: format code

.env.dist CHANGED
@@ -1,18 +1,14 @@
-# OpenAI API Configuration
-OPENAI_API_KEY=your-openai-api-key-here
-
-# Model Configuration
-MODEL_NAME=gpt-4o-mini
-TEMPERATURE=0.7
-MAX_TOKENS=4000
+# LLM Configuration
+LLM_PROVIDER=openrouter
+LLM_API_KEY=
+LLM_MODEL=google/gemini-2.0-flash-exp:free
+LLM_TEMPERATURE=0.7
+LLM_MAX_TOKENS=4000
 
 # Search Configuration
-MAX_PAPERS=15
+MAX_PAPERS_TO_RETURN=10
 MAX_RESULTS_PER_SOURCE=20
 SEARCH_TIMEOUT=300
 
 # Semantic Scholar API (optional, but recommended for higher rate limits)
 SEMANTIC_SCHOLAR_API_KEY=your-semantic-scholar-api-key-here
-
-# Logging
-LOG_LEVEL=INFO

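These keys line up with the pydantic-settings usage visible in src/paper_survey_agent/settings.py further down; a minimal sketch of how the renamed variables could be loaded, with the class body assumed for illustration rather than taken from the repo:

    # Hypothetical mapping of the new .env keys onto a pydantic-settings class.
    from pydantic_settings import BaseSettings, SettingsConfigDict

    class Settings(BaseSettings):
        model_config = SettingsConfigDict(env_file=".env", extra="ignore")

        # LLM configuration (replaces the old OPENAI_* / MODEL_* keys)
        LLM_PROVIDER: str = "openrouter"
        LLM_API_KEY: str = ""
        LLM_MODEL: str = "google/gemini-2.0-flash-exp:free"
        LLM_TEMPERATURE: float = 0.7
        LLM_MAX_TOKENS: int = 4000

        # Search configuration
        MAX_PAPERS_TO_RETURN: int = 10
        MAX_RESULTS_PER_SOURCE: int = 20
        SEARCH_TIMEOUT: int = 300

    settings = Settings()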
.gitignore CHANGED
@@ -31,6 +31,8 @@ MANIFEST
 # before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
+.env*
+!.env.dist
 
 # Installer logs
 pip-log.txt
notebooks/test_agent.ipynb CHANGED
@@ -53,14 +53,16 @@
 ],
 "source": [
 "import asyncio\n",
-"import sys\n",
 "from pathlib import Path\n",
+"import sys\n",
+"\n",
 "\n",
 "# Add src to path\n",
-"sys.path.insert(0, str(Path.cwd().parent / 'src'))\n",
+"sys.path.insert(0, str(Path.cwd().parent / \"src\"))\n",
 "\n",
-"from paper_survey_agent.tools import retrieve_papers, rank_and_deduplicate\n",
 "from paper_survey_agent.models.paper import Paper\n",
+"from paper_survey_agent.tools import rank_and_deduplicate, retrieve_papers\n",
+"\n",
 "\n",
 "print(\"✅ Modules imported successfully!\")"
 ]
@@ -99,11 +101,7 @@
 "query = \"transformer models\"\n",
 "print(f\"🔍 Searching for papers on: '{query}'...\\n\")\n",
 "\n",
-"papers = await retrieve_papers(\n",
-"    query=query,\n",
-"    sources=[\"arxiv\", \"semantic_scholar\"],\n",
-"    max_results_per_source=10\n",
-")\n",
+"papers = await retrieve_papers(query=query, sources=[\"arxiv\", \"semantic_scholar\"], max_results_per_source=10)\n",
 "\n",
 "print(f\"\\n✅ Found {len(papers)} papers\")"
 ]
@@ -209,12 +207,7 @@
 "source": [
 "print(f\"🔄 Ranking {len(papers)} papers...\\n\")\n",
 "\n",
-"ranked_papers = rank_and_deduplicate(\n",
-"    papers=papers,\n",
-"    topic=query,\n",
-"    top_k=10,\n",
-"    fuzzy_threshold=85\n",
-")\n",
+"ranked_papers = rank_and_deduplicate(papers=papers, topic=query, top_k=10, fuzzy_threshold=85)\n",
 "\n",
 "print(f\"\\n✅ Top {len(ranked_papers)} most relevant papers (after deduplication)\")"
 ]
@@ -648,22 +641,14 @@
 "query2 = \"attention mechanisms in neural networks\"\n",
 "print(f\"🔍 Searching for papers on: '{query2}'...\\n\")\n",
 "\n",
-"papers2 = await retrieve_papers(\n",
-"    query=query2,\n",
-"    sources=[\"arxiv\", \"semantic_scholar\"],\n",
-"    max_results_per_source=8\n",
-")\n",
+"papers2 = await retrieve_papers(query=query2, sources=[\"arxiv\", \"semantic_scholar\"], max_results_per_source=8)\n",
 "\n",
 "print(f\"\\n✅ Found {len(papers2)} papers\")\n",
 "\n",
 "# Ranking\n",
-"ranked_papers2 = rank_and_deduplicate(\n",
-"    papers=papers2,\n",
-"    topic=query2,\n",
-"    top_k=5\n",
-")\n",
+"ranked_papers2 = rank_and_deduplicate(papers=papers2, topic=query2, top_k=5)\n",
 "\n",
-"print(f\"\\n🏆 Top 5 after ranking:\\n\")\n",
+"print(\"\\n🏆 Top 5 after ranking:\\n\")\n",
 "for i, paper in enumerate(ranked_papers2, 1):\n",
 "    print(f\"{i}. {paper.title}\")\n",
 "    print(f\"   📅 {paper.published_date} | 📖 {paper.citations_count or 'N/A'} citations\")\n",
@@ -717,6 +702,7 @@
 "from collections import Counter\n",
 "from datetime import datetime\n",
 "\n",
+"\n",
 "print(\"📊 Statistics for the retrieved papers:\\n\")\n",
 "print(f\"Total found: {len(papers)}\")\n",
 "print(f\"After deduplication: {len(ranked_papers)}\")\n",
@@ -724,14 +710,14 @@
 "\n",
 "# Breakdown by source\n",
 "sources = Counter(p.source for p in papers)\n",
-"print(f\"\\nBy source:\")\n",
+"print(\"\\nBy source:\")\n",
 "for source, count in sources.items():\n",
 "    print(f\"  - {source}: {count}\")\n",
 "\n",
 "# Breakdown by publication year\n",
 "years = [p.published_date.year for p in ranked_papers]\n",
 "year_counts = Counter(years)\n",
-"print(f\"\\nBy publication year (top 10):\")\n",
+"print(\"\\nBy publication year (top 10):\")\n",
 "for year, count in sorted(year_counts.items(), reverse=True):\n",
 "    print(f\"  - {year}: {count}\")\n",
 "\n",
@@ -745,7 +731,7 @@
 "\n",
 "# PDF availability\n",
 "with_pdf = sum(1 for p in ranked_papers if p.pdf_url)\n",
-"print(f\"\\nPapers with PDF: {with_pdf}/{len(ranked_papers)} ({with_pdf/len(ranked_papers)*100:.1f}%)\")"
+"print(f\"\\nPapers with PDF: {with_pdf}/{len(ranked_papers)} ({with_pdf / len(ranked_papers) * 100:.1f}%)\")"
 ]
 },
 {
@@ -885,39 +871,35 @@
 "test_queries = [\n",
 "    (\"machine learning\", \"Popular topic (expect 10 papers)\"),\n",
 "    (\"lemon juice\", \"Rare topic (possibly fewer than 10)\"),\n",
-"    (\"zzxxyywwqqppvvkkjjhhggffddssaammnnbbccll123456789\", \"Completely random string (expect 0 → ValueError)\")\n",
+"    (\"zzxxyywwqqppvvkkjjhhggffddssaammnnbbccll123456789\", \"Completely random string (expect 0 → ValueError)\"),\n",
 "]\n",
 "\n",
 "for query, description in test_queries:\n",
 "    print(f\"\\n📝 Query: '{query}'\")\n",
 "    print(f\"   Description: {description}\")\n",
 "    print(\"-\" * 80)\n",
-"    \n",
+"\n",
 "    try:\n",
-"        papers_test = await retrieve_papers(\n",
-"            query=query,\n",
-"            sources=[\"arxiv\"],\n",
-"            max_results_per_source=10\n",
-"        )\n",
-"        \n",
+"        papers_test = await retrieve_papers(query=query, sources=[\"arxiv\"], max_results_per_source=10)\n",
+"\n",
 "        found = len(papers_test)\n",
-"        \n",
+"\n",
 "        if found == 10:\n",
 "            print(f\"   ✅ Found {found} papers with PDF, exactly as many as requested!\")\n",
 "        elif found > 0:\n",
 "            print(f\"   ⚠️ Found {found} papers with PDF (fewer than 10)\")\n",
-"            print(f\"   💡 The system returned everything it found rather than raising an error\")\n",
-"    \n",
+"            print(\"   💡 The system returned everything it found rather than raising an error\")\n",
+"\n",
 "        # Show the first 2 papers, if any\n",
 "        if found > 0:\n",
-"            print(f\"\\n   📚 Examples of what was found:\")\n",
+"            print(\"\\n   📚 Examples of what was found:\")\n",
 "            for i, paper in enumerate(papers_test[:2], 1):\n",
 "                print(f\"      {i}. {paper.title[:70]}...\")\n",
-"    \n",
+"\n",
 "    except ValueError as e:\n",
 "        print(f\"   ❌ ValueError: {e}\")\n",
-"        print(f\"   💡 This means 0 papers with PDF were found, a critical error!\")\n",
+"        print(\"   💡 This means 0 papers with PDF were found, a critical error!\")\n",
+"\n",
 "    print()"
 ]
 }
pyproject.toml CHANGED
@@ -13,11 +13,12 @@ dependencies = [
     "pydantic-settings>=2.0.0",
     "python-dotenv>=1.0.0",
     "httpx>=0.25.0",
-    "gradio>=4.0.0",
+    "gradio==6.1.0",
     "rapidfuzz>=3.0.0",
    "tenacity>=8.0.0",
     "pymupdf>=1.26.7",
     "litellm>=1.80.10",
+    "loguru>=0.7.3",
 ]
 
 [dependency-groups]
requirements.txt CHANGED
@@ -7,3 +7,4 @@ python-dotenv>=1.0.0
 httpx>=0.25.0
 rapidfuzz>=3.0.0
 tenacity>=8.0.0
+loguru>=0.7.0
src/paper_survey_agent/agent.py CHANGED
@@ -1,7 +1,8 @@
 import asyncio
-import logging
 from typing import Optional
 
+from loguru import logger
+
 from paper_survey_agent.models.paper import SummarizedPaper
 from paper_survey_agent.tools import (
     generate_search_query,
@@ -11,9 +12,6 @@ from paper_survey_agent.tools import (
 )
 
 
-logger = logging.getLogger(__name__)
-
-
 class PaperSurveyAgent:
     async def run(self, topic: str) -> tuple[list[SummarizedPaper], str] | None:
         logger.info(f" Agent started for topic: '{topic}'")
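The same logging swap repeats across every src/ diff that follows; a minimal before/after sketch of the pattern (log message illustrative):

    # Before: stdlib logging, one logger instance per module.
    import logging
    logger = logging.getLogger(__name__)

    # After: loguru ships a ready-made global logger, so the
    # per-module getLogger boilerplate disappears entirely.
    from loguru import logger

    logger.info("Agent started for topic: '{}'", "transformers")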
src/paper_survey_agent/apis/__init__.py CHANGED
@@ -4,4 +4,5 @@ from paper_survey_agent.apis.arxiv import ArxivAPI
 from paper_survey_agent.apis.base import BaseScientificAPI
 from paper_survey_agent.apis.semantic_scholar import SemanticScholarAPI
 
+
 __all__ = ["ArxivAPI", "SemanticScholarAPI", "BaseScientificAPI"]
src/paper_survey_agent/apis/arxiv.py CHANGED
@@ -1,10 +1,8 @@
-"""arXiv API client for retrieving scientific papers."""
-
 from datetime import datetime
-import logging
 from typing import Optional
 
 import arxiv
+from loguru import logger
 from tenacity import retry, stop_after_attempt, wait_exponential
 
 from paper_survey_agent.apis.base import BaseScientificAPI
@@ -12,9 +10,6 @@ from paper_survey_agent.models.paper import Paper
 from paper_survey_agent.settings import settings
 
 
-logger = logging.getLogger(__name__)
-
-
 class ArxivAPI(BaseScientificAPI):
     def __init__(self, page_size: int = settings.ARXIV_PAGE_SIZE, delay_seconds: int = settings.ARXIV_DELAY_SECONDS):
         self.page_size = page_size
src/paper_survey_agent/apis/semantic_scholar.py CHANGED
@@ -1,9 +1,9 @@
 import asyncio
 from datetime import datetime
-import logging
 from typing import Optional
 
 import httpx
+from loguru import logger
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
 
 from paper_survey_agent.apis.base import BaseScientificAPI
@@ -11,9 +11,6 @@ from paper_survey_agent.models.paper import Paper
 from paper_survey_agent.settings import settings
 
 
-logger = logging.getLogger(__name__)
-
-
 class SemanticScholarAPI(BaseScientificAPI):
     BASE_URL = settings.SEMANTIC_SCHOLAR_API_BASE_URL
 
@@ -48,7 +45,7 @@ class SemanticScholarAPI(BaseScientificAPI):
             timeout=timeout,
         )
 
-        logger.info(f"Initialized SemanticScholarAPI " f"(authenticated: {bool(api_key)}, timeout: {timeout}s)")
+        logger.info(f"Initialized SemanticScholarAPI (authenticated: {bool(api_key)}, timeout: {timeout}s)")
 
     async def __aenter__(self):
         return self
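For context, the tenacity imports retained here follow the usual retry-decorator pattern; a self-contained sketch with assumed parameters (the attempt count, backoff values, and fetch_json helper are illustrative, not this client's code):

    import httpx
    from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

    # Retry transient HTTP failures with exponential backoff (parameters assumed).
    @retry(
        retry=retry_if_exception_type(httpx.HTTPError),
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=30),
    )
    async def fetch_json(client: httpx.AsyncClient, url: str) -> dict:
        response = await client.get(url)
        response.raise_for_status()  # httpx.HTTPStatusError is a subclass of httpx.HTTPError
        return response.json()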
src/paper_survey_agent/llm/client.py CHANGED
@@ -1,16 +1,13 @@
-import logging
 import os
 from typing import Any
 
 from litellm import completion
+from loguru import logger
 from tenacity import retry, stop_after_attempt, wait_exponential
 
 from paper_survey_agent.settings import settings
 
 
-logger = logging.getLogger(__name__)
-
-
 class LLMClient:
     def __init__(self):
         self.model = settings.LLM_MODEL
src/paper_survey_agent/models/__init__.py CHANGED
@@ -2,4 +2,5 @@
 
 from .paper import Paper
 
+
 __all__ = ["Paper"]
src/paper_survey_agent/settings.py CHANGED
@@ -1,4 +1,3 @@
-import os
 from pathlib import Path
 
 from pydantic_settings import BaseSettings, SettingsConfigDict
src/paper_survey_agent/tools/generate_search_query/generate_search_query.py CHANGED
@@ -1,12 +1,9 @@
-import logging
+from loguru import logger
 
 from paper_survey_agent.llm.client import llm_client
 from paper_survey_agent.llm.prompts import SEARCH_QUERY_REFINEMENT_SYSTEM_PROMPT
 
 
-logger = logging.getLogger(__name__)
-
-
 def generate_search_query(user_query: str) -> str:
     logger.info(f"🧠 Refining user query: '{user_query}'")
 
src/paper_survey_agent/tools/search_and_load_papers_txt/search_and_load_papers_txt.py CHANGED
@@ -1,7 +1,8 @@
 import asyncio
-import logging
 import math
 
+from loguru import logger
+
 from paper_survey_agent.models.paper import ProcessedPaper
 from paper_survey_agent.settings import settings
 from paper_survey_agent.tools.search_and_load_papers_txt.utils.downloader import download_papers
@@ -11,9 +12,6 @@ from paper_survey_agent.tools.search_and_load_papers_txt.utils.retrieval import
 from paper_survey_agent.tools.search_and_load_papers_txt.utils.text_extractor import convert_pdfs_to_text
 
 
-logger = logging.getLogger(__name__)
-
-
 async def search_and_load_papers_txt(query: str) -> list[ProcessedPaper]:
     clear_data_directory()
 
@@ -45,7 +43,7 @@ async def search_and_load_papers_txt(query: str) -> list[ProcessedPaper]:
     current_index = batch_end
 
     if len(downloaded_pdfs) < target_count:
-        logger.warning(f"Pipeline finished with {len(downloaded_pdfs)} papers, " f"short of target {target_count}.")
+        logger.warning(f"Pipeline finished with {len(downloaded_pdfs)} papers, short of target {target_count}.")
 
     txt_paths = await convert_pdfs_to_text(downloaded_pdfs)
 
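The hunk context (current_index, batch_end, target_count) suggests a batched download loop that stops once enough PDFs are collected; a hedged sketch of that shape, with all names assumed rather than taken from the file:

    import asyncio
    from typing import Awaitable, Callable

    # Illustrative batch loop (names assumed): keep downloading until the
    # target count is reached or the candidate list is exhausted.
    async def download_until_target(
        candidates: list[str],
        download_batch: Callable[[list[str]], Awaitable[list[str]]],
        target_count: int,
        batch_size: int = 5,
    ) -> list[str]:
        downloaded: list[str] = []
        current_index = 0
        while len(downloaded) < target_count and current_index < len(candidates):
            batch_end = min(current_index + batch_size, len(candidates))
            downloaded.extend(await download_batch(candidates[current_index:batch_end]))
            current_index = batch_end
        return downloaded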
src/paper_survey_agent/tools/search_and_load_papers_txt/utils/downloader.py CHANGED
@@ -1,19 +1,16 @@
 import asyncio
-import logging
 from pathlib import Path
 import re
 from typing import Optional
 
 import httpx
+from loguru import logger
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
 
 from paper_survey_agent.models.paper import Paper
 from paper_survey_agent.settings import settings
 
 
-logger = logging.getLogger(__name__)
-
-
 async def download_papers(papers: list[Paper], destination_dir: str | Path | None = None) -> dict[str, Path]:
     if destination_dir is None:
         destination_dir = Path(settings.DATA_DIR) / "pdfs"
src/paper_survey_agent/tools/search_and_load_papers_txt/utils/ranking.py CHANGED
@@ -1,18 +1,15 @@
 from collections import Counter
 from datetime import datetime
-import logging
 import re
 from typing import Optional
 
+from loguru import logger
 from rapidfuzz import fuzz
 
 from paper_survey_agent.models.paper import Paper
 from paper_survey_agent.settings import settings
 
 
-logger = logging.getLogger(__name__)
-
-
 def rank_and_deduplicate(
     papers: list[Paper],
     topic: str,
@@ -42,7 +39,7 @@ def rank_and_deduplicate(
     if scored_papers:
         logger.info(
             f"Returning top {len(top_papers)} papers. "
-            f"Score range: {scored_papers[0][1]:.3f} - {scored_papers[min(top_k-1, len(scored_papers)-1)][1]:.3f}"
+            f"Score range: {scored_papers[0][1]:.3f} - {scored_papers[min(top_k - 1, len(scored_papers) - 1)][1]:.3f}"
         )
 
     return top_papers
@@ -64,9 +61,7 @@ def _deduplicate_papers(papers: list[Paper], fuzzy_threshold: int) -> list[Paper
         for seen_title, seen_paper in seen_titles:
             similarity = fuzz.ratio(normalized_title, seen_title)
             if similarity >= fuzzy_threshold:
-                logger.debug(
-                    f"Fuzzy duplicate found ({similarity}% similar): " f"'{paper.title}' ≈ '{seen_paper.title}'"
-                )
+                logger.debug(f"Fuzzy duplicate found ({similarity}% similar): '{paper.title}' ≈ '{seen_paper.title}'")
                 if paper.citations_count and not seen_paper.citations_count:
                     deduplicated.remove(seen_paper)
                     seen_ids.remove(seen_paper.id)
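The reflowed logger.debug above sits inside the fuzzy-deduplication step; a tiny self-contained illustration of the rapidfuzz score it checks (the 85 threshold is taken from the notebook's fuzzy_threshold, the titles are examples):

    from rapidfuzz import fuzz

    # Two near-identical titles typically score well above the 85% threshold,
    # so the second would be dropped as a fuzzy duplicate.
    a = "attention is all you need"
    b = "attention is all you need."
    print(fuzz.ratio(a, b))  # ~98, comfortably above 85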
src/paper_survey_agent/tools/search_and_load_papers_txt/utils/retrieval.py CHANGED
@@ -1,16 +1,14 @@
 import asyncio
-import logging
 import os
 from typing import Optional
 
+from loguru import logger
+
 from paper_survey_agent.apis import ArxivAPI, SemanticScholarAPI
 from paper_survey_agent.models.paper import Paper
 from paper_survey_agent.settings import settings
 
 
-logger = logging.getLogger(__name__)
-
-
 async def retrieve_papers(
     query: str,
     sources: list[str] | None = None,
@@ -27,7 +25,7 @@ async def retrieve_papers(
     if semantic_scholar_api_key:
         logger.info("Using Semantic Scholar API key from environment")
 
-    logger.info(f"Retrieving papers: query='{query}', sources={sources}, " f"max_per_source={max_results_per_source}")
+    logger.info(f"Retrieving papers: query='{query}', sources={sources}, max_per_source={max_results_per_source}")
 
     tasks = []
 
@@ -66,9 +64,7 @@ async def retrieve_papers(
     )
 
     if not all_papers:
-        raise ValueError(
-            f"Failed to retrieve papers from all sources. " f"Attempted: {sources}, Failed: {failed_sources}"
-        )
+        raise ValueError(f"Failed to retrieve papers from all sources. Attempted: {sources}, Failed: {failed_sources}")
 
     return all_papers
 
@@ -183,7 +179,7 @@ async def _fetch_from_semantic_scholar(
         )
     else:
         logger.info(
-            f"Semantic Scholar returned {len(papers_with_pdf)} papers with PDF " f"(target: {max_results})"
+            f"Semantic Scholar returned {len(papers_with_pdf)} papers with PDF (target: {max_results})"
        )
 
     return papers_with_pdf
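The single-line ValueError above fires only after every source has been tried; a minimal sketch of the gather-and-track-failures shape it implies (function and variable names assumed, not this module's code):

    import asyncio

    # Illustrative: run one fetch task per source, collect successes,
    # remember which sources failed, and fail hard only if nothing came back.
    async def gather_papers(fetchers: dict) -> list:
        results = await asyncio.gather(
            *(fetch() for fetch in fetchers.values()), return_exceptions=True
        )
        all_papers, failed_sources = [], []
        for source, result in zip(fetchers.keys(), results):
            if isinstance(result, Exception):
                failed_sources.append(source)
            else:
                all_papers.extend(result)
        if not all_papers:
            raise ValueError(
                f"Failed to retrieve papers from all sources. "
                f"Attempted: {list(fetchers)}, Failed: {failed_sources}"
            )
        return all_papers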
src/paper_survey_agent/tools/search_and_load_papers_txt/utils/text_extractor.py CHANGED
@@ -1,16 +1,13 @@
 import asyncio
-import logging
 from pathlib import Path
 
 import aiofiles
+from loguru import logger
 import pymupdf
 
 from paper_survey_agent.settings import settings
 
 
-logger = logging.getLogger(__name__)
-
-
 def extract_text_from_pdf_sync(pdf_path: Path) -> str:
     try:
         with pymupdf.open(pdf_path) as doc:
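For reference, a self-contained sketch of the pymupdf extraction pattern this function opens with (error handling and settings lookups omitted; not the file's full body, which the diff truncates):

    from pathlib import Path
    import pymupdf

    # Extract plain text from every page of a PDF (simplified sketch).
    def extract_text(pdf_path: Path) -> str:
        with pymupdf.open(pdf_path) as doc:
            return "\n".join(page.get_text() for page in doc)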
src/paper_survey_agent/tools/summarize_papers/summarize_papers.py CHANGED
@@ -1,18 +1,15 @@
 import asyncio
 import json
-import logging
 from typing import Any
 
 import aiofiles
+from loguru import logger
 
 from paper_survey_agent.llm.client import llm_client
 from paper_survey_agent.llm.prompts import PAPER_SUMMARIZATION_SYSTEM_PROMPT
 from paper_survey_agent.models.paper import ProcessedPaper, SummarizedPaper
 
 
-logger = logging.getLogger(__name__)
-
-
 def parse_llm_json(text: str) -> dict[str, Any]:
     cleaned = text.strip()
 
src/paper_survey_agent/tools/synthesize_survey/synthesize_survey.py CHANGED
@@ -1,14 +1,12 @@
 import asyncio
-import logging
+
+from loguru import logger
 
 from paper_survey_agent.llm.client import llm_client
 from paper_survey_agent.llm.prompts import SURVEY_SYNTHESIS_SYSTEM_PROMPT
 from paper_survey_agent.models.paper import SummarizedPaper
 
 
-logger = logging.getLogger(__name__)
-
-
 def format_papers_for_synthesis(papers: list[SummarizedPaper]) -> str:
     context_parts = []
 
uv.lock CHANGED
The diff for this file is too large to render. See raw diff