Commit 0a40afa
Parent(s): 0b7edde

Update Dockerfile to use new app entry point and enhance requirements.txt with additional dependencies. Remove obsolete streamlit_app.py file.

Note: this view is limited to 50 files because the commit contains too many changes.
- .DS_Store +0 -0
- .env +7 -0
- .vscode/launch.json +13 -0
- Dockerfile +1 -1
- requirements.txt +7 -1
- src/README.md +167 -0
- src/agents/__init__.py +1 -0
- src/agents/__pycache__/__init__.cpython-312.pyc +0 -0
- src/agents/__pycache__/__init__.cpython-313.pyc +0 -0
- src/agents/__pycache__/base_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/base_agent.cpython-313.pyc +0 -0
- src/agents/__pycache__/cache_agent.cpython-313.pyc +0 -0
- src/agents/__pycache__/confidence_scorer.cpython-312.pyc +0 -0
- src/agents/__pycache__/confidence_scorer.cpython-313.pyc +0 -0
- src/agents/__pycache__/document_intelligence_agent.cpython-313.pyc +0 -0
- src/agents/__pycache__/field_mapper_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/field_mapper_agent.cpython-313.pyc +0 -0
- src/agents/__pycache__/index_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/index_agent.cpython-313.pyc +0 -0
- src/agents/__pycache__/pdf_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/pdf_agent.cpython-313.pyc +0 -0
- src/agents/__pycache__/query_generator.cpython-312.pyc +0 -0
- src/agents/__pycache__/query_generator.cpython-313.pyc +0 -0
- src/agents/__pycache__/semantic_reasoner.cpython-312.pyc +0 -0
- src/agents/__pycache__/semantic_reasoner.cpython-313.pyc +0 -0
- src/agents/__pycache__/table_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/table_agent.cpython-313.pyc +0 -0
- src/agents/base_agent.py +8 -0
- src/agents/confidence_scorer.py +6 -0
- src/agents/field_mapper_agent.py +311 -0
- src/agents/index_agent.py +135 -0
- src/agents/pdf_agent.py +28 -0
- src/agents/query_generator.py +7 -0
- src/agents/semantic_reasoner.py +8 -0
- src/agents/table_agent.py +46 -0
- src/app.py +364 -0
- src/config/__init__.py +1 -0
- src/config/__pycache__/__init__.cpython-312.pyc +0 -0
- src/config/__pycache__/__init__.cpython-313.pyc +0 -0
- src/config/__pycache__/configurations.cpython-313.pyc +0 -0
- src/config/__pycache__/settings.cpython-312.pyc +0 -0
- src/config/__pycache__/settings.cpython-313.pyc +0 -0
- src/config/field_rules.yaml +4 -0
- src/config/prompts.yaml +53 -0
- src/config/settings.py +18 -0
- src/docker/Dockerfile +21 -0
- src/docker/entrypoint.sh +3 -0
- src/main.py +6 -0
- src/orchestrator/__init__.py +1 -0
- src/orchestrator/__pycache__/__init__.cpython-312.pyc +0 -0
.DS_Store
ADDED
Binary file (6.15 kB).
.env
ADDED
@@ -0,0 +1,7 @@
AZURE_DI_ENDPOINT=https://luisvalenciadi.cognitiveservices.azure.com/
AZURE_DI_KEY=2MMHWymCHlykFwMGtaDHKGcbcWFpMfF29vqWKV6C584Zi9EJj5m4JQQJ99BEACfhMk5XJ3w3AAALACOGUYF3
OPENAI_API_KEY=1bW5tUhsr8yHXQe0Pra6YdM43fWCUr06KtbGy9gKxVOd7ut5vL9GJQQJ99BDACfhMk5XJ3w3AAAAACOGTQtA
AZURE_OPENAI_ENDPOINT=https://lvaaifoundry1567796533.openai.azure.com/
AZURE_OPENAI_DEPLOYMENT=gpt-4.1
AZURE_OPENAI_API_VERSION=2025-03-01-preview
AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small
.vscode/launch.json
ADDED
@@ -0,0 +1,13 @@
{
    "configurations": [
        {
            "name": "Streamlit (debug)",
            "type": "debugpy",
            "request": "launch",
            "module": "streamlit",
            "args": ["run", "src/app.py"],
            "envFile": "${workspaceFolder}/.env",
        }
    ]
}
Dockerfile
CHANGED
@@ -18,4 +18,4 @@ EXPOSE 8501
 
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
-ENTRYPOINT ["streamlit", "run", "src/
+ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
requirements.txt
CHANGED
@@ -1,3 +1,9 @@
 altair
 pandas
-streamlit
+streamlit
+pyyaml
+python-dotenv
+openai
+pydantic-settings
+PyMuPDF
+azure-ai-documentintelligence
src/README.md
ADDED
@@ -0,0 +1,167 @@
# Deep-Research PDF Field Extractor

A powerful tool for extracting structured data from PDF documents, designed to handle various document types and extract specific fields of interest.

## For End Users

### Overview
The PDF Field Extractor helps you extract specific information from PDF documents. It can extract any fields you specify, such as dates, names, values, locations, and more. The tool is particularly useful for converting unstructured PDF data into structured, analyzable formats.

### How to Use

1. **Upload Your PDF**
   - Click the "Upload PDF" button
   - Select your PDF file from your computer

2. **Specify Fields to Extract**
   - Enter the fields you want to extract, separated by commas
   - Example: `Date, Name, Value, Location, Page, FileName`

3. **Optional: Add Field Descriptions**
   - You can provide additional context about the fields.
   - This helps the system better understand what to look for

4. **Run Extraction**
   - Click the "Run extraction" button
   - Wait for the process to complete
   - View your results in a table format

5. **Download Results**
   - Download your extracted data as a CSV file
   - View execution traces and logs if needed

### Features
- Automatic document type detection
- Smart field extraction
- Support for tables and text
- Detailed execution traces
- Downloadable results and logs

## For Developers

### Architecture Overview

The application is built using a multi-agent architecture with the following components:

#### Core Components

1. **Planner (`orchestrator/planner.py`)**
   - Generates execution plans using Azure OpenAI

2. **Executor (`orchestrator/executor.py`)**
   - Executes the generated plan
   - Manages agent execution flow
   - Handles context and result management

3. **Agents**
   - `PDFAgent`: Extracts text from PDFs
   - `TableAgent`: Extracts tables from PDFs
   - `FieldMapper`: Maps fields to values
   - `ForEachField`: Control flow for field iteration

### Agent Pipeline

1. **Document Processing**
   ```python
   # Document is processed in stages:
   1. PDF text extraction
   2. Table extraction
   3. Field mapping
   ```

2. **Field Extraction Process**
   - Document type inference
   - User profile determination
   - Page-by-page scanning
   - Value extraction and validation

3. **Context Building**
   - Document metadata
   - Field descriptions
   - User context
   - Execution history

### Key Features

#### Document Type Inference
The system automatically infers document type and user profile:
```python
# Example inference:
"Document type: Analytical report
User profile: Data analysts or researchers working with document analysis"
```

#### Field Mapping
The FieldMapper agent uses a sophisticated approach:
1. Document context analysis
2. Page-by-page scanning
3. Value extraction using LLM
4. Result validation

#### Execution Traces
The system maintains detailed execution traces:
- Tool execution history
- Success/failure status
- Detailed logs
- Result storage

### Technical Setup

1. **Dependencies**
   ```python
   # Key dependencies:
   - streamlit
   - pandas
   - PyMuPDF (fitz)
   - Azure OpenAI
   - Azure Document Intelligence
   ```

2. **Configuration**
   - Environment variables for API keys
   - Prompt templates in `config/prompts.yaml`
   - Settings in `config/settings.py`

3. **Logging System**
   ```python
   # Custom logging setup:
   - LogCaptureHandler for UI display
   - Structured logging format
   - Execution history storage
   ```

### Development Guidelines

1. **Adding New Agents**
   - Inherit from base agent class
   - Implement required methods
   - Add to planner configuration

2. **Modifying Extraction Logic**
   - Update prompt templates
   - Modify field mapping logic
   - Adjust validation rules

3. **Extending Functionality**
   - Add new field types
   - Implement custom validators
   - Create new output formats

### Testing
- Unit tests for agents
- Integration tests for pipeline
- End-to-end testing with sample PDFs

### Deployment
- Streamlit app deployment
- Environment configuration
- API key management
- Logging setup

### Future Improvements
- Enhanced error handling
- Additional field types
- Improved validation
- Performance optimization
- Extended documentation
src/agents/__init__.py
ADDED
@@ -0,0 +1 @@
src/agents/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (173 Bytes).

src/agents/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (161 Bytes).

src/agents/__pycache__/base_agent.cpython-312.pyc
ADDED
Binary file (757 Bytes).

src/agents/__pycache__/base_agent.cpython-313.pyc
ADDED
Binary file (805 Bytes).

src/agents/__pycache__/cache_agent.cpython-313.pyc
ADDED
Binary file (7.52 kB).

src/agents/__pycache__/confidence_scorer.cpython-312.pyc
ADDED
Binary file (659 Bytes).

src/agents/__pycache__/confidence_scorer.cpython-313.pyc
ADDED
Binary file (707 Bytes).

src/agents/__pycache__/document_intelligence_agent.cpython-313.pyc
ADDED
Binary file (3.07 kB).

src/agents/__pycache__/field_mapper_agent.cpython-312.pyc
ADDED
Binary file (18 kB).

src/agents/__pycache__/field_mapper_agent.cpython-313.pyc
ADDED
Binary file (18.2 kB).

src/agents/__pycache__/index_agent.cpython-312.pyc
ADDED
Binary file (7.63 kB).

src/agents/__pycache__/index_agent.cpython-313.pyc
ADDED
Binary file (7.72 kB).

src/agents/__pycache__/pdf_agent.cpython-312.pyc
ADDED
Binary file (1.71 kB).

src/agents/__pycache__/pdf_agent.cpython-313.pyc
ADDED
Binary file (1.75 kB).

src/agents/__pycache__/query_generator.cpython-312.pyc
ADDED
Binary file (762 Bytes).

src/agents/__pycache__/query_generator.cpython-313.pyc
ADDED
Binary file (810 Bytes).

src/agents/__pycache__/semantic_reasoner.cpython-312.pyc
ADDED
Binary file (902 Bytes).

src/agents/__pycache__/semantic_reasoner.cpython-313.pyc
ADDED
Binary file (960 Bytes).

src/agents/__pycache__/table_agent.cpython-312.pyc
ADDED
Binary file (2.74 kB).

src/agents/__pycache__/table_agent.cpython-313.pyc
ADDED
Binary file (2.81 kB).
src/agents/base_agent.py
ADDED
@@ -0,0 +1,8 @@
"""Abstract base class for every agent."""
from abc import ABC, abstractmethod
from typing import Dict, Any

class BaseAgent(ABC):
    @abstractmethod
    def execute(self, ctx: Dict[str, Any]):
        """Mutate / consume ctx and return a value."""
src/agents/confidence_scorer.py
ADDED
@@ -0,0 +1,6 @@
from typing import Dict, Any
from .base_agent import BaseAgent

class ConfidenceScorer(BaseAgent):
    def execute(self, ctx: Dict[str, Any]):
        return 1.0  # always confident for stub
src/agents/field_mapper_agent.py
ADDED
@@ -0,0 +1,311 @@
"""Map a single field to a candidate value using page-by-page analysis and LLM-based extraction."""
from typing import Dict, Any, Optional, List
import logging
import re
import json
from .base_agent import BaseAgent
from services.llm_client import LLMClient
from services.embedding_client import EmbeddingClient
from config.settings import settings

# Configure logging to disable verbose Azure HTTP logs
logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
logging.getLogger('azure.core.pipeline').setLevel(logging.WARNING)
logging.getLogger('azure').setLevel(logging.WARNING)

class FieldMapperAgent(BaseAgent):
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.llm = LLMClient(settings)
        self.embedding_client = EmbeddingClient()

    def _infer_document_context(self, text: str) -> str:
        """Use LLM to infer document context and user profile."""
        prompt = f"""Given this document text, describe the document type and typical user profile in 1-2 sentences.
Focus on the domain, purpose, and who would use this document.

Document text:
{text[:2000]} # First 2000 chars for context

Response format:
Document type: [type]
User profile: [profile]
"""

        try:
            self.logger.info("Inferring document context...")
            self.logger.debug(f"Using text preview: {text[:500]}...")
            context = self.llm.responses(prompt, temperature=0.0)
            self.logger.info(f"Inferred context: {context}")
            return context
        except Exception as e:
            self.logger.error(f"Error inferring context: {str(e)}")
            return "Generic document user"

    def _find_similar_chunks_search(self, query: str, index: Dict[str, Any], top_k: int = 3) -> List[Dict[str, Any]]:
        """Find chunks semantically similar to the query using cosine similarity."""
        try:
            self.logger.info(f"Finding similar chunks for query: {query}")
            self.logger.debug(f"Index contains {len(index['chunks'])} chunks and {len(index['embeddings'])} embeddings")

            # Get query embedding
            self.logger.debug("Generating embedding for query...")
            query_embedding = self.embedding_client.embed([query])[0]
            self.logger.debug(f"Query embedding generated, length: {len(query_embedding)}")

            # Calculate similarities
            similarities = []
            for i, (chunk, embedding) in enumerate(zip(index["chunks"], index["embeddings"])):
                similarity = self._cosine_similarity(query_embedding, embedding)
                similarities.append((similarity, chunk))
                self.logger.debug(f"Chunk {i} similarity: {similarity:.3f}")
                self.logger.debug(f"Chunk {i} preview: {chunk['text'][:100]}...")

            # Sort by similarity and return top k
            similarities.sort(reverse=True)
            results = [chunk for _, chunk in similarities[:top_k]]

            # Log top results
            self.logger.info(f"Found {len(results)} similar chunks")
            for i, (sim, chunk) in enumerate(similarities[:top_k]):
                self.logger.info(f"Top {i+1} match (similarity: {sim:.3f}): {chunk['text'][:200]}...")

            return results

        except Exception as e:
            self.logger.error(f"Error finding similar chunks: {str(e)}", exc_info=True)
            return []

    def _cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Calculate cosine similarity between two vectors."""
        import numpy as np
        try:
            # Check for zero vectors
            if not a or not b or all(x == 0 for x in a) or all(x == 0 for x in b):
                self.logger.warning("Zero vector detected in cosine similarity calculation")
                return 0.0

            # Convert to numpy arrays
            a_np = np.array(a)
            b_np = np.array(b)

            # Calculate norms
            norm_a = np.linalg.norm(a_np)
            norm_b = np.linalg.norm(b_np)

            # Check for zero norms
            if norm_a == 0 or norm_b == 0:
                self.logger.warning("Zero norm detected in cosine similarity calculation")
                return 0.0

            # Calculate similarity
            similarity = np.dot(a_np, b_np) / (norm_a * norm_b)

            # Check for NaN
            if np.isnan(similarity):
                self.logger.warning("NaN detected in cosine similarity calculation")
                return 0.0

            return float(similarity)
        except Exception as e:
            self.logger.error(f"Error calculating cosine similarity: {str(e)}")
            return 0.0

    def _extract_field_value_search(self, field: str, chunks: List[Dict[str, Any]], context: str) -> Optional[str]:
        """Use LLM to extract field value from relevant chunks."""
        # Combine chunks into context
        chunk_texts = [chunk["text"] for chunk in chunks]
        combined_context = "\n".join(chunk_texts)

        self.logger.info(f"Extracting value for field '{field}' from {len(chunks)} chunks")
        self.logger.debug(f"Combined context preview: {combined_context[:500]}...")

        # Get filename from context if available
        filename = self.ctx.get("pdf_meta", {}).get("filename", "")
        filename_context = f"\nDocument filename: {filename}" if filename else ""

        prompt = f"""You are an expert in {context}

Your task is to extract the value for the field: {field}{filename_context}

Consider the following context from the document:
{combined_context}

Instructions:
1. Look for the field value in the context
2. If you find multiple potential values, choose the most relevant one
3. If you're not sure, return None
4. Return ONLY the value, no explanations

Field value:"""

        try:
            self.logger.info(f"Calling LLM to extract value for field '{field}'")
            self.logger.debug(f"Using prompt: {prompt}")
            value = self.llm.responses(prompt, temperature=0.0)
            self.logger.debug(f"Raw LLM response: {value}")

            if value and value.lower() not in ["none", "null", "n/a"]:
                self.logger.info(f"Successfully extracted value: {value}")
                return value.strip()
            else:
                self.logger.warning(f"LLM returned no valid value for field '{field}'")
                return None
        except Exception as e:
            self.logger.error(f"Error extracting field value: {str(e)}", exc_info=True)
            return None

    def _extract_field_value_from_page(self, field: str, page_text: str, context: str) -> Optional[str]:
        """Use LLM to extract field value from a single page."""
        self.logger.info(f"Extracting value for field '{field}' from page")
        self.logger.debug(f"Page text preview: {page_text[:500]}...")

        # Get filename from context if available
        filename = self.ctx.get("pdf_meta", {}).get("filename", "")
        filename_context = f"\nDocument filename: {filename}" if filename else ""

        prompt = f"""You are an expert in {context}

Your task is to extract the value for the field: {field}{filename_context}

Consider the following page from the document:
{page_text}

Instructions:
1. Look for the field values in this page
2. Return the data in a tabular format where each field is a column
3. Each field should have an array of values
4. The arrays must be aligned (same length) to represent rows
5. Return ONLY the JSON value, no explanations
6. Format the response as a valid JSON object with field names as keys
7. Keep the structure flat - do not nest values under 'details' or other keys

Example response format:
{{
    "field1": ["value1", "value2", "value3"],
    "field2": ["value4", "value5", "value6"],
    "field3": ["value7", "value8", "value9"]
}}

Field value:"""

        try:
            self.logger.info(f"Calling LLM to extract value for field '{field}' from page")
            value = self.llm.responses(prompt, temperature=0.0)
            self.logger.debug(f"Raw LLM response: {value}")

            if value and value.lower() not in ["none", "null", "n/a"]:
                # Try to parse as JSON to ensure it's valid
                try:
                    json_value = json.loads(value)
                    self.logger.info(f"Successfully extracted value: {json.dumps(json_value, indent=2)}")
                    return json.dumps(json_value, indent=2)
                except json.JSONDecodeError:
                    # If not valid JSON, wrap it in a JSON object
                    json_value = {field: value.strip()}
                    self.logger.info(f"Wrapped non-JSON value in JSON object: {json.dumps(json_value, indent=2)}")
                    return json.dumps(json_value, indent=2)
            else:
                self.logger.warning(f"LLM returned no valid value for field '{field}'")
                return None
        except Exception as e:
            self.logger.error(f"Error extracting field value from page: {str(e)}", exc_info=True)
            return None

    def execute(self, ctx: Dict[str, Any]):  # noqa: D401
        field = ctx.get("current_field")
        self.logger.info(f"Starting field mapping for: {field}")

        # Store context for use in extraction methods
        self.ctx = ctx

        # Get text and index
        text = ""
        index = None
        if "index" in ctx and isinstance(ctx["index"], dict):
            index = ctx["index"]
            text = index.get("text", "")
            self.logger.info(f"Using text from index (length: {len(text)})")
            self.logger.debug(f"Index contains {len(index.get('chunks', []))} chunks")
            self.logger.debug(f"Index contains {len(index.get('embeddings', []))} embeddings")
        elif "text" in ctx:
            text = ctx["text"]
            self.logger.info(f"Using text from direct context (length: {len(text)})")

        if not field:
            self.logger.warning("No field provided in context")
            return None
        if not text:
            self.logger.warning("No text content found in context or index")
            return None

        # Infer document context if not already present
        if "document_context" not in ctx:
            ctx["document_context"] = self._infer_document_context(text)

        self.logger.info(f"Processing field: {field}")
        self.logger.info(f"Using document context: {ctx['document_context']}")

        # Split text into pages using the same markers as IndexAgent
        page_markers = [
            '<!-- PageBreak -->',
            'Page',
            '---',
            '\n\n',  # Double newline as fallback
        ]

        # First try to find which marker is used
        used_marker = None
        for marker in page_markers:
            if marker in text:
                used_marker = marker
                self.logger.info(f"Found page marker: {marker}")
                break

        if not used_marker:
            self.logger.warning("No page markers found, falling back to double newline")
            used_marker = '\n\n'

        # Split the text
        if used_marker == '\n\n':
            pages = [p.strip() for p in text.split(used_marker) if p.strip()]
        else:
            pages = []
            current_page = []

            for line in text.split('\n'):
                if used_marker in line:
                    if current_page:
                        pages.append('\n'.join(current_page))
                    current_page = [line]
                else:
                    current_page.append(line)

            if current_page:
                pages.append('\n'.join(current_page))

        self.logger.info(f"Split document into {len(pages)} pages")

        # Process each page
        for i, page in enumerate(pages, 1):
            self.logger.info(f"Processing page {i}/{len(pages)}")
            value = self._extract_field_value_from_page(field, page, ctx["document_context"])
            if value:
                return value

        # If no value found in any page, try the search-based approach as fallback
        self.logger.warning("No value found in page-by-page analysis, falling back to search-based approach")

        if index and "embeddings" in index:
            self.logger.info("Using semantic search with embeddings")
            search_query = f"{field} in {ctx['document_context']}"
            similar_chunks = self._find_similar_chunks_search(search_query, index)

            if similar_chunks:
                self.logger.info(f"Found {len(similar_chunks)} relevant chunks, attempting value extraction")
                value = self._extract_field_value_search(field, similar_chunks, ctx["document_context"])
                if value:
                    return value

        self.logger.warning(f"No candidate found for field: {field}")
        return f"<no candidate for {field}>"
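
The agent is normally driven by the executor, but its context contract is visible from `execute`: it reads `current_field`, either `index` or `text`, and optionally `pdf_meta`. A rough illustration of a direct call follows; it is only a sketch, the page text is made up, and it still needs the Azure OpenAI settings from `.env`, since `LLMClient` and `EmbeddingClient` (from the `services` package not shown in this 50-file view) are used internally.

```python
from agents.field_mapper_agent import FieldMapperAgent

ctx = {
    "current_field": "Protein Lot",
    "text": "Protein Lot: ABC-123\n<!-- PageBreak -->\nMore pages here...",  # plain text with page markers
    "pdf_meta": {"filename": "report.pdf"},
}

agent = FieldMapperAgent()
value = agent.execute(ctx)  # JSON string with per-field value arrays, or "<no candidate for ...>"
print(value)
```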
src/agents/index_agent.py
ADDED
@@ -0,0 +1,135 @@
"""Create a semantic index of document content using embeddings."""
from typing import Dict, Any, List, Tuple
import logging
import numpy as np
from .base_agent import BaseAgent
from services.embedding_client import EmbeddingClient

class IndexAgent(BaseAgent):
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.embedding_client = EmbeddingClient()
        self.logger.info("IndexAgent initialized")

    def execute(self, ctx: Dict[str, Any]):
        """Create a semantic index of document content."""
        try:
            self.logger.info("Starting index creation")

            # Get text from PDF agent
            text = ctx.get("text", "")
            if not text:
                self.logger.warning("No text content found in context")
                return {}
            self.logger.info(f"Found text content of length {len(text)}")

            # Get tables from Table agent
            tables = ctx.get("tables", [])
            self.logger.info(f"Found {len(tables)} tables in context")

            # Combine all content
            all_content = text
            if tables:
                all_content += "\n".join(tables)
            self.logger.info(f"Combined content length: {len(all_content)}")

            # Create chunks with metadata
            chunks = self._create_chunks(all_content)
            self.logger.info(f"Created {len(chunks)} content chunks")
            for i, chunk in enumerate(chunks):
                self.logger.debug(f"Chunk {i}: {chunk['text'][:100]}...")

            # Get embeddings for chunks
            chunk_texts = [chunk["text"] for chunk in chunks]
            self.logger.info(f"Getting embeddings for {len(chunk_texts)} chunks")
            embeddings = self.embedding_client.embed(chunk_texts)
            self.logger.info(f"Generated {len(embeddings)} embeddings")

            # Create semantic index
            index = {
                "chunks": chunks,
                "embeddings": embeddings,
                "text": all_content,  # Keep full text for non-semantic search
            }

            # Store in context
            ctx["index"] = index
            self.logger.info(f"Created semantic index with {len(chunks)} chunks")
            return index

        except Exception as e:
            self.logger.error(f"Error in IndexAgent: {str(e)}", exc_info=True)
            return {}

    def _create_chunks(self, text: str, chunk_size: int = 1000) -> List[Dict[str, Any]]:
        """Split text into chunks with metadata."""
        self.logger.info(f"Creating chunks from text of length {len(text)}")
        chunks = []
        sentences = text.split(". ")
        self.logger.info(f"Split into {len(sentences)} sentences")
        current_chunk = []
        current_size = 0
        total_length = 0

        for sentence in sentences:
            sentence = sentence.strip() + ". "
            sentence_size = len(sentence)

            if current_size + sentence_size > chunk_size and current_chunk:
                # Save current chunk
                chunk_text = "".join(current_chunk)
                chunks.append({
                    "text": chunk_text,
                    "start": total_length,
                    "end": total_length + len(chunk_text),
                    "type": "text"
                })
                total_length += len(chunk_text)
                self.logger.debug(f"Created chunk of size {len(chunk_text)}")
                current_chunk = []
                current_size = 0

            current_chunk.append(sentence)
            current_size += sentence_size

        # Add last chunk if any
        if current_chunk:
            chunk_text = "".join(current_chunk)
            chunks.append({
                "text": chunk_text,
                "start": total_length,
                "end": total_length + len(chunk_text),
                "type": "text"
            })
            self.logger.debug(f"Created final chunk of size {len(chunk_text)}")

        self.logger.info(f"Created {len(chunks)} total chunks")
        return chunks

    def find_similar_chunks(self, query: str, index: Dict[str, Any], top_k: int = 3) -> List[Dict[str, Any]]:
        """Find chunks semantically similar to the query."""
        try:
            self.logger.info(f"Finding similar chunks for query: {query}")
            # Get query embedding
            query_embedding = self.embedding_client.embed([query])[0]

            # Calculate similarities
            similarities = []
            for chunk, embedding in zip(index["chunks"], index["embeddings"]):
                similarity = self._cosine_similarity(query_embedding, embedding)
                similarities.append((similarity, chunk))
                self.logger.debug(f"Chunk similarity: {similarity:.3f}")

            # Sort by similarity and return top k
            similarities.sort(reverse=True)
            results = [chunk for _, chunk in similarities[:top_k]]
            self.logger.info(f"Found {len(results)} similar chunks")
            return results

        except Exception as e:
            self.logger.error(f"Error finding similar chunks: {str(e)}", exc_info=True)
            return []

    def _cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Calculate cosine similarity between two vectors."""
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
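
A rough usage sketch of the indexing flow as wired here; embedding calls go through `EmbeddingClient` (not part of this diff), so valid Azure OpenAI credentials are assumed, and the input text is made up.

```python
from agents.index_agent import IndexAgent

agent = IndexAgent()
ctx = {"text": "First sentence. Second sentence. " * 100, "tables": []}

index = agent.execute(ctx)          # builds {"chunks", "embeddings", "text"} and stores it in ctx["index"]
hits = agent.find_similar_chunks("protein lot number", index, top_k=3)
for chunk in hits:
    print(chunk["start"], chunk["end"], chunk["text"][:80])
```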
src/agents/pdf_agent.py
ADDED
@@ -0,0 +1,28 @@
"""Extract raw text from the uploaded PDF using PyMuPDF.
This keeps the implementation minimal for a POC while remaining easy to extend.
"""
from typing import Dict, Any, List

import fitz  # PyMuPDF

from .base_agent import BaseAgent


class PDFAgent(BaseAgent):
    """Reads the PDF, concatenates all page text and stores it under ``ctx['text']``."""

    def _extract_text(self, pdf_bytes: bytes) -> str:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")  # type: ignore[arg-type]
        pages: List[str] = [page.get_text() for page in doc]  # list-comp for clarity
        return "\n".join(pages)

    # -----------------------------------------------------
    def execute(self, ctx: Dict[str, Any]):  # noqa: D401
        pdf_file = ctx.get("pdf_file")
        if pdf_file is None:
            raise ValueError("PDFAgent expected 'pdf_file' in context but none provided.")

        pdf_bytes = pdf_file.read()
        text = self._extract_text(pdf_bytes)
        ctx["text"] = text
        return text
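
Since `execute` only needs a file-like object with `.read()`, the agent can be exercised outside Streamlit; a small sketch (the path `sample.pdf` is illustrative):

```python
from agents.pdf_agent import PDFAgent

with open("sample.pdf", "rb") as fh:  # any binary file-like object works
    ctx = {"pdf_file": fh}
    text = PDFAgent().execute(ctx)

print(len(ctx["text"]), "characters extracted")
```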
src/agents/query_generator.py
ADDED
@@ -0,0 +1,7 @@
from typing import Dict, Any
from .base_agent import BaseAgent

class QueryGenerator(BaseAgent):
    def execute(self, ctx: Dict[str, Any]):
        field = ctx.get("current_field")
        return f"Follow-up query for {field}"
src/agents/semantic_reasoner.py
ADDED
@@ -0,0 +1,8 @@
from typing import Dict, Any
from .base_agent import BaseAgent

class SemanticReasonerAgent(BaseAgent):
    def execute(self, ctx: Dict[str, Any]):
        field = ctx.get("current_field")
        candidate = ctx.get("candidates", {}).get(field)
        return candidate or f"<unresolved {field}>"
src/agents/table_agent.py
ADDED
@@ -0,0 +1,46 @@
"""Extract tables from PDF using Azure Document Intelligence."""
from typing import Dict, Any
import logging
from .base_agent import BaseAgent
from services.azure_di_service import AzureDIService

class TableAgent(BaseAgent):
    def __init__(self, settings):
        self.service = AzureDIService(settings.AZURE_DI_ENDPOINT, settings.AZURE_DI_KEY)
        self.logger = logging.getLogger(__name__)

    def execute(self, ctx: Dict[str, Any]):
        """Extract tables from PDF."""
        try:
            pdf_file = ctx.get("pdf_file")
            if not pdf_file:
                self.logger.error("No PDF file found in context")
                return {}

            # Get the current position of the file pointer
            current_pos = pdf_file.tell()
            self.logger.info(f"Current file position: {current_pos}")

            # Reset to beginning if not at start
            if current_pos != 0:
                self.logger.info("Resetting file pointer to beginning")
                pdf_file.seek(0)

            # Read the file
            pdf_bytes = pdf_file.read()
            self.logger.info(f"Read {len(pdf_bytes)} bytes from PDF")

            # Extract content using Azure DI
            result = self.service.extract_tables(pdf_bytes)

            # Store both text and tables in context
            ctx["text"] = result["text"]
            ctx["tables"] = result["tables"]

            self.logger.info(f"Extracted {len(result['text'])} characters of text and {len(result['tables'])} tables")
            return result

        except Exception as e:
            self.logger.error(f"Error in TableAgent: {str(e)}")
            self.logger.exception("Full traceback:")
            return {}
src/app.py
ADDED
@@ -0,0 +1,364 @@
"""Streamlit front-end entry-point."""
import yaml
import json
import streamlit as st
import logging
from dotenv import load_dotenv
from orchestrator.planner import Planner
from orchestrator.executor import Executor
from config.settings import settings
import fitz  # PyMuPDF local import to avoid heavy load on startup
import pandas as pd
from datetime import datetime
import io
import sys
from io import StringIO

# Create a custom stream handler to capture logs
class LogCaptureHandler(logging.StreamHandler):
    def __init__(self):
        super().__init__()
        self.logs = []

    def emit(self, record):
        try:
            msg = self.format(record)
            self.logs.append(msg)
        except Exception:
            self.handleError(record)

    def get_logs(self):
        return "\n".join(self.logs)

    def clear(self):
        self.logs = []

# Initialize session state for storing execution history
if 'execution_history' not in st.session_state:
    st.session_state.execution_history = []

# Set up logging capture
log_capture = LogCaptureHandler()
log_capture.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))

# Configure root logger
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
root_logger.addHandler(log_capture)

# Configure specific loggers
for logger_name in ['orchestrator', 'agents', 'services']:
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    logger.addHandler(log_capture)

load_dotenv()

st.set_page_config(page_title="PDF Field Extractor", layout="wide")

# Sidebar navigation
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", ["Documentation", "Traces", "Execution"])

# Documentation Page
if page == "Documentation":
    st.title("Deep-Research PDF Field Extractor (POC)")

    st.markdown("""
    This system uses a multi-step pipeline to extract fields from PDFs:
    1. **Document Intelligence**: Extracts text and tables from PDFs using Azure Document Intelligence
    2. **Semantic Indexing**: Creates searchable chunks with embeddings for semantic search
    3. **Field Extraction**: Uses a two-step approach:
       - First attempts page-by-page scanning for precise extraction
       - Falls back to semantic search if no value is found
    4. **Validation**: Ensures extracted values are correct and properly formatted
    5. **Confidence Scoring**: Identifies which extractions need review
    """)

    st.markdown("""
    ### Agent Descriptions
    #### DocumentIntelligenceAgent
    - Uses Azure Document Intelligence to extract text and tables
    - Preserves document layout and structure
    - Outputs both raw text and formatted tables as HTML
    - Handles complex document layouts and tables

    #### IndexAgent
    - Creates semantic search index from extracted content
    - Splits document into manageable chunks with metadata
    - Generates embeddings for semantic search
    - Provides both chunk-based and full-text search capabilities
    - Includes chunk statistics and visualization

    #### FieldMapper
    - Implements a two-step field extraction strategy:
      1. Page-by-page scanning for precise extraction
      2. Semantic search fallback if no value found
    - Uses document context to improve extraction accuracy
    - Handles multiple potential values with confidence scoring
    - Returns structured JSON responses with value details

    #### SemanticReasoner
    - Validates and cleans up candidate values
    - Uses domain knowledge to ensure values make sense
    - Can reformat values to standard format
    - Returns `<unresolved>` if value is wrong/missing

    #### ConfidenceScorer
    - Assigns confidence score (0-1) to each extraction
    - Helps identify which extractions need review
    - Can trigger follow-up queries when confidence is low

    #### QueryGenerator
    - Generates follow-up questions when confidence is low
    - Creates concise questions (≤12 words)
    - Helps guide system to better extractions
    """)

# Traces Page
elif page == "Traces":
    st.title("Execution Traces")

    if not st.session_state.execution_history:
        st.info("No execution traces available yet. Run an extraction to see traces here.")
    else:
        # Create a DataFrame from the execution history
        history_data = []
        for record in st.session_state.execution_history:
            history_data.append({
                "filename": record["filename"],
                "datetime": record["datetime"],
                "fields": ", ".join(record.get("fields", [])),
                "logs": record.get("logs", []),
                "results": record.get("results", None)
            })

        history_df = pd.DataFrame(history_data)

        # Display column headers
        col1, col2, col3, col4, col5 = st.columns([2, 2, 3, 1, 1])
        with col1:
            st.markdown("**Filename**")
        with col2:
            st.markdown("**Timestamp**")
        with col3:
            st.markdown("**Fields**")
        with col4:
            st.markdown("**Logs**")
        with col5:
            st.markdown("**Results**")

        st.markdown("---")  # Add a separator line

        # Display the table with download buttons
        for idx, row in history_df.iterrows():
            col1, col2, col3, col4, col5 = st.columns([2, 2, 3, 1, 1])
            with col1:
                st.write(row["filename"])
            with col2:
                st.write(row["datetime"])
            with col3:
                st.write(row["fields"])
            with col4:
                if row["logs"]:  # Check if we have any logs
                    st.download_button(
                        "Download Logs",
                        row["logs"],  # Use the stored logs
                        file_name=f"logs_{row['filename']}_{row['datetime']}.txt",
                        key=f"logs_dl_{idx}"
                    )
                else:
                    st.write("No Logs")
            with col5:
                if row["results"] is not None:
                    results_df = pd.DataFrame(row["results"])
                    st.download_button(
                        "Download Results",
                        results_df.to_csv(index=False),
                        file_name=f"results_{row['filename']}_{row['datetime']}.csv",
                        key=f"results_dl_{idx}"
                    )
                else:
                    st.write("No Results")
            st.markdown("---")  # Add a separator line between rows

# Execution Page
else:  # page == "Execution"
    st.title("Deep-Research PDF Field Extractor (POC)")

    pdf_file = st.file_uploader("Upload PDF", type=["pdf"])
    fields_str = st.text_input("Fields (comma-separated)", "Protein Lot, Chain, Residue")
    desc_blob = st.text_area("Field descriptions / rules (YAML, optional)")

    def flatten_json_response(json_data, fields):
        """Flatten the nested JSON response into a tabular structure with dynamic columns."""
        logger = logging.getLogger(__name__)
        logger.info("Starting flatten_json_response")
        logger.info(f"Input fields: {fields}")

        # Handle the case where the response is a string
        if isinstance(json_data, str):
            logger.info("Input is a string, attempting to parse as JSON")
            try:
                json_data = json.loads(json_data)
                logger.info("Successfully parsed JSON string")
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse JSON string: {e}")
                return pd.DataFrame(columns=fields)

        # If the data is wrapped in an array, get the first item
        if isinstance(json_data, list) and len(json_data) > 0:
            logger.info("Data is wrapped in an array, extracting first item")
            json_data = json_data[0]

        # If the data is a dictionary with numeric keys, get the first value
        if isinstance(json_data, dict):
            keys = list(json_data.keys())
            logger.info(f"Checking dictionary keys: {keys}")
            # Check if all keys are integers or string representations of integers
            if all(isinstance(k, int) or (isinstance(k, str) and k.isdigit()) for k in keys):
                logger.info("Data has numeric keys, extracting first value")
                first_key = sorted(keys, key=lambda x: int(x) if isinstance(x, str) else x)[0]
                json_data = json_data[first_key]
                logger.info(f"Extracted data from key '{first_key}'")

        logger.info(f"JSON data keys: {list(json_data.keys()) if isinstance(json_data, dict) else 'Not a dict'}")

        # Create a list to store rows
        rows = []

        # Get the length of the first array to determine number of rows
        if isinstance(json_data, dict) and len(json_data) > 0:
            first_field = list(json_data.keys())[0]
            num_rows = len(json_data[first_field]) if isinstance(json_data[first_field], list) else 1
            logger.info(f"Number of rows to process: {num_rows}")

            # Create a row for each index
            for i in range(num_rows):
                logger.debug(f"Processing row {i}")
                row = {}
                for field in fields:
                    if field in json_data and isinstance(json_data[field], list) and i < len(json_data[field]):
                        row[field] = json_data[field][i]
                        logger.debug(f"Field '{field}' value at index {i}: {json_data[field][i]}")
                    else:
                        row[field] = None
                        logger.debug(f"Field '{field}' not found or index {i} out of bounds")
                rows.append(row)
        else:
            logger.error(f"Unexpected data structure: {type(json_data)}")
            return pd.DataFrame(columns=fields)

        # Create DataFrame with all requested fields as columns
        df = pd.DataFrame(rows)
        logger.info(f"Created DataFrame with shape: {df.shape}")
        logger.info(f"DataFrame columns: {df.columns.tolist()}")

        # Ensure columns are in the same order as the fields list
        df = df[fields]
        logger.info(f"Final DataFrame columns after reordering: {df.columns.tolist()}")

        return df

    if st.button("Run extraction") and pdf_file:
        field_list = [f.strip() for f in fields_str.split(",") if f.strip()]
        field_descs = yaml.safe_load(desc_blob) if desc_blob.strip() else {}

        try:
            with st.spinner("Planning …"):
                # quick first-page text preview to give LLM document context
                doc = fitz.open(stream=pdf_file.getvalue(), filetype="pdf")  # type: ignore[arg-type]
                preview = "\n".join(page.get_text() for page in doc[:10])[:20000]  # first 2 pages, 2k chars

                planner = Planner()
                plan = planner.build_plan(
                    pdf_meta={"filename": pdf_file.name},
                    doc_preview=preview,
                    fields=field_list,
                    field_descs=field_descs,
                )

            # Add a visual separator
            st.markdown("---")

            with st.spinner("Executing …"):
                executor = Executor(settings=settings)
                results, logs = executor.run(plan, pdf_file)

                # Add detailed logging about what executor returned
                logger.info(f"Executor returned results of type: {type(results)}")
                logger.info(f"Results content: {results}")

                # Check if results is already a DataFrame
                if isinstance(results, pd.DataFrame):
                    logger.info(f"Results is already a DataFrame with shape: {results.shape}")
                    logger.info(f"DataFrame columns: {results.columns.tolist()}")
                    logger.info(f"DataFrame head: {results.head()}")
                    df = results
                else:
                    logger.info("Results is not a DataFrame, calling flatten_json_response")
                    # Process results using flatten_json_response
                    df = flatten_json_response(results, field_list)

                # Log final DataFrame info
                logger.info(f"Final DataFrame shape: {df.shape}")
                logger.info(f"Final DataFrame columns: {df.columns.tolist()}")
                if not df.empty:
                    logger.info(f"Final DataFrame sample: {df.head()}")

            # Store execution in history
            execution_record = {
                "filename": pdf_file.name,
                "datetime": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "fields": field_list,
                "logs": log_capture.get_logs(),  # Store the actual logs
                "results": df.to_dict() if not df.empty else None
            }
            st.session_state.execution_history.append(execution_record)
            log_capture.clear()  # Clear logs after storing them

            # ----------------- UI: show execution tree -----------------
            st.subheader("Execution trace")
            for log in logs:
                indent = " " * 4 * log["depth"]
                # Add error indicator if there was an error
                error_indicator = "❌ " if log.get("error") else "✅ "
                # Use a fixed preview text instead of the result
                with st.expander(f"{indent}{error_indicator}{log['tool']} – Click to view result"):
                    st.markdown(f"**Args**: `{log['args']}`", unsafe_allow_html=True)
                    if log.get("error"):
                        st.error(f"Error: {log['error']}")

                    # Special handling for IndexAgent output
                    if log['tool'] == "IndexAgent" and isinstance(log["result"], dict):
                        # Display chunk statistics if available
                        if "chunk_stats" in log["result"]:
                            st.markdown("### Chunk Statistics")
                            # Create a DataFrame for better visualization
                            stats_df = pd.DataFrame(log["result"]["chunk_stats"])
                            st.dataframe(stats_df)

                            # Add summary statistics
                            st.markdown("### Summary")
                            st.markdown(f"""
                            - Total chunks: {len(stats_df)}
                            - Average chunk length: {stats_df['length'].mean():.0f} characters
                            - Shortest chunk: {stats_df['length'].min()} characters
                            - Longest chunk: {stats_df['length'].max()} characters
                            """)

                            # Add a bar chart of chunk lengths
                            st.markdown("### Chunk Length Distribution")
                            st.bar_chart(stats_df.set_index('chunk_number')['length'])
                    else:
                        st.code(log["result"])

            if not df.empty:
                st.success("Done ✅")
                st.dataframe(df)
                st.download_button("Download CSV", df.to_csv(index=False), "results.csv")
            else:
                st.warning("No results were extracted. Check the execution trace for errors.")
        except Exception as e:
            logging.exception("App error:")
            st.error(f"An error occurred: {e}")
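
To make the data flow in `flatten_json_response` concrete, this is the shape it expects from the FieldMapper path and what it hands back; a minimal sketch with made-up values, not output from the app itself:

```python
import json
import pandas as pd

# Per-field value arrays, as produced by FieldMapperAgent's page prompt
results = json.dumps({
    "Protein Lot": ["L001", "L002"],
    "Chain": ["Heavy", "Light"],
    "Residue": ["K123", "C45"],
})

# flatten_json_response(results, ["Protein Lot", "Chain", "Residue"]) would yield
# a DataFrame equivalent to:
expected = pd.DataFrame({
    "Protein Lot": ["L001", "L002"],
    "Chain": ["Heavy", "Light"],
    "Residue": ["K123", "C45"],
})
print(expected)
```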
src/config/__init__.py
ADDED
@@ -0,0 +1 @@
src/config/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (173 Bytes).

src/config/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (161 Bytes).

src/config/__pycache__/configurations.cpython-313.pyc
ADDED
Binary file (7.13 kB).

src/config/__pycache__/settings.cpython-312.pyc
ADDED
Binary file (1.27 kB).

src/config/__pycache__/settings.cpython-313.pyc
ADDED
Binary file (1.27 kB).
src/config/field_rules.yaml
ADDED
@@ -0,0 +1,4 @@
Chain:
  description: Heavy vs Light chain based on Seq Loc prefix.
  rules:
    starts_with: {L: "Light", H: "Heavy"}
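
The code that consumes field_rules.yaml is not part of this diff, so the following is only a sketch of how such a starts_with rule could be applied; apply_chain_rule is a hypothetical helper, and the file path assumes the repo root as working directory.

```python
import yaml

def apply_chain_rule(seq_loc: str, rules: dict) -> str | None:
    """Hypothetical helper: map a Seq Loc prefix to a Chain value via the starts_with rule."""
    for prefix, value in rules["Chain"]["rules"]["starts_with"].items():
        if seq_loc.startswith(prefix):
            return value
    return None

with open("src/config/field_rules.yaml") as fh:
    rules = yaml.safe_load(fh)

print(apply_chain_rule("H101", rules))  # -> "Heavy"
print(apply_chain_rule("L23", rules))   # -> "Light"
```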
src/config/prompts.yaml
ADDED
@@ -0,0 +1,53 @@
planner: |
  You are "Doc-to-Record Planner v1" – an expert at designing multi-step
  extraction pipelines that convert an arbitrary document into a flat record
  of user-requested fields.

  You will be given:
  • doc_preview: a few kB of raw text from the uploaded document (may include table HTML).
  • fields: the list of field names the user wants extracted.
  • pdf_meta / field_descriptions for extra context.

  Available tools (use exactly these names in the JSON):
  PDFAgent – extracts raw text from the full PDF.
  TableAgent – calls Azure Document Intelligence to get HTML tables.
  FieldMapper – maps one field name to a candidate value.

  Control-flow helper:
  ForEachField – loops over every requested field and executes the nested "loop" array.

  Output JSON **only** with this schema (no markdown):
  {
    "fields": [<same list you received>],
    "steps": [
      {"tool": "PDFAgent", "args": {}},
      {"tool": "TableAgent", "args": {}},
      {"tool": "ForEachField",
       "loop": [
         {"tool": "FieldMapper", "args": {"field": "$field"}}
       ]}
    ]
  }

  Always include PDFAgent and TableAgent first, then the ForEachField loop. Keep plans short and deterministic.

field_mapper: |
  You are "FieldMapper v1" – a precision extractor.
  Given:
  • field (target field name)
  • context (snippet of raw PDF text or table row)
  Return **only** the best candidate value (no extra words).

semantic_reasoner: |
  You are "Semantic Reasoner v1".
  Validate the candidate value for a field using domain knowledge and the surrounding context.
  If the candidate is obviously wrong / absent output <unresolved FIELDNAME> (same token as placeholder).
  Otherwise output a cleaned, final value – no explanation text.

confidence_scorer: |
  You are "Confidence Scorer". For the given field and candidate value assign a confidence between 0 and 1.
  Output **only** the float.

query_generator: |
  You are "Follow-up Query Generator". The previous candidate for a field was low-confidence.
  Formulate a concise follow-up question (<=12 words) that, when answered, would help identify the field value.
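
How these templates are loaded is not shown in this diff (that lives in the orchestrator/services code), but the file is plain YAML with block scalars, so a loader along these lines is all that is implied; treat it as a sketch rather than the project's actual loading code, and the path assumes the repo root.

```python
import yaml

with open("src/config/prompts.yaml") as fh:
    prompts = yaml.safe_load(fh)

planner_system_prompt = prompts["planner"]      # multi-line string from the YAML block scalar
field_mapper_prompt = prompts["field_mapper"]
print(planner_system_prompt.splitlines()[0])
```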
src/config/settings.py
ADDED
@@ -0,0 +1,18 @@
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    OPENAI_API_KEY: str = Field("", env="OPENAI_API_KEY")
    AZURE_DI_ENDPOINT: str = Field("", env="AZURE_DI_ENDPOINT")
    AZURE_DI_KEY: str = Field("", env="AZURE_DI_KEY")

    # Azure OpenAI
    AZURE_OPENAI_ENDPOINT: str = Field("", env="AZURE_OPENAI_ENDPOINT")
    AZURE_OPENAI_DEPLOYMENT: str = Field("", env="AZURE_OPENAI_DEPLOYMENT")
    AZURE_OPENAI_API_VERSION: str = Field("2024-02-15-preview", env="AZURE_OPENAI_API_VERSION")
    AZURE_OPENAI_API_KEY: str = Field("", env="AZURE_OPENAI_API_KEY")
    AZURE_OPENAI_EMBEDDING_MODEL: str = Field("text-embedding-3-small", env="AZURE_OPENAI_EMBEDDING_MODEL")

    model_config: SettingsConfigDict = {"env_file": ".env"}

settings = Settings()
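
The settings object is imported directly elsewhere in this commit (`from config.settings import settings`), with values resolved from environment variables or the `.env` file via pydantic-settings; a short illustration:

```python
from config.settings import settings

# Values come from environment variables or .env; missing keys fall back to the defaults above.
print(settings.AZURE_OPENAI_DEPLOYMENT)
print(bool(settings.AZURE_DI_ENDPOINT and settings.AZURE_DI_KEY))
```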
src/docker/Dockerfile
ADDED
@@ -0,0 +1,21 @@
# ---------- builder -------------
FROM python:3.11-slim AS builder
RUN pip install --no-cache-dir uv

WORKDIR /app
COPY pyproject.toml ./
RUN uv pip install -r <(uv pip compile --quiet) \
    && uv pip freeze > /installed.txt  # layer cache

COPY . .

# ---------- runtime -------------
FROM python:3.11-slim
ENV PYTHONUNBUFFERED=1
WORKDIR /app
COPY --from=builder /installed.txt /installed.txt
RUN xargs -a /installed.txt pip install --no-cache-dir

COPY . .

ENTRYPOINT ["bash", "docker/entrypoint.sh"]
src/docker/entrypoint.sh
ADDED
@@ -0,0 +1,3 @@
#!/usr/bin/env bash
source .env  # load secrets (mounted at run)
streamlit run app.py --server.port ${PORT:-8501}
src/main.py
ADDED
@@ -0,0 +1,6 @@
def main():
    print("Hello from filetorecord!")


if __name__ == "__main__":
    main()
src/orchestrator/__init__.py
ADDED
@@ -0,0 +1 @@
src/orchestrator/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (179 Bytes).