levalencia commited on
Commit
0a40afa
·
1 Parent(s): 0b7edde

Update Dockerfile to use new app entry point and enhance requirements.txt with additional dependencies. Remove obsolete streamlit_app.py file.

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .DS_Store +0 -0
  2. .env +7 -0
  3. .vscode/launch.json +13 -0
  4. Dockerfile +1 -1
  5. requirements.txt +7 -1
  6. src/README.md +167 -0
  7. src/agents/__init__.py +1 -0
  8. src/agents/__pycache__/__init__.cpython-312.pyc +0 -0
  9. src/agents/__pycache__/__init__.cpython-313.pyc +0 -0
  10. src/agents/__pycache__/base_agent.cpython-312.pyc +0 -0
  11. src/agents/__pycache__/base_agent.cpython-313.pyc +0 -0
  12. src/agents/__pycache__/cache_agent.cpython-313.pyc +0 -0
  13. src/agents/__pycache__/confidence_scorer.cpython-312.pyc +0 -0
  14. src/agents/__pycache__/confidence_scorer.cpython-313.pyc +0 -0
  15. src/agents/__pycache__/document_intelligence_agent.cpython-313.pyc +0 -0
  16. src/agents/__pycache__/field_mapper_agent.cpython-312.pyc +0 -0
  17. src/agents/__pycache__/field_mapper_agent.cpython-313.pyc +0 -0
  18. src/agents/__pycache__/index_agent.cpython-312.pyc +0 -0
  19. src/agents/__pycache__/index_agent.cpython-313.pyc +0 -0
  20. src/agents/__pycache__/pdf_agent.cpython-312.pyc +0 -0
  21. src/agents/__pycache__/pdf_agent.cpython-313.pyc +0 -0
  22. src/agents/__pycache__/query_generator.cpython-312.pyc +0 -0
  23. src/agents/__pycache__/query_generator.cpython-313.pyc +0 -0
  24. src/agents/__pycache__/semantic_reasoner.cpython-312.pyc +0 -0
  25. src/agents/__pycache__/semantic_reasoner.cpython-313.pyc +0 -0
  26. src/agents/__pycache__/table_agent.cpython-312.pyc +0 -0
  27. src/agents/__pycache__/table_agent.cpython-313.pyc +0 -0
  28. src/agents/base_agent.py +8 -0
  29. src/agents/confidence_scorer.py +6 -0
  30. src/agents/field_mapper_agent.py +311 -0
  31. src/agents/index_agent.py +135 -0
  32. src/agents/pdf_agent.py +28 -0
  33. src/agents/query_generator.py +7 -0
  34. src/agents/semantic_reasoner.py +8 -0
  35. src/agents/table_agent.py +46 -0
  36. src/app.py +364 -0
  37. src/config/__init__.py +1 -0
  38. src/config/__pycache__/__init__.cpython-312.pyc +0 -0
  39. src/config/__pycache__/__init__.cpython-313.pyc +0 -0
  40. src/config/__pycache__/configurations.cpython-313.pyc +0 -0
  41. src/config/__pycache__/settings.cpython-312.pyc +0 -0
  42. src/config/__pycache__/settings.cpython-313.pyc +0 -0
  43. src/config/field_rules.yaml +4 -0
  44. src/config/prompts.yaml +53 -0
  45. src/config/settings.py +18 -0
  46. src/docker/Dockerfile +21 -0
  47. src/docker/entrypoint.sh +3 -0
  48. src/main.py +6 -0
  49. src/orchestrator/__init__.py +1 -0
  50. src/orchestrator/__pycache__/__init__.cpython-312.pyc +0 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.env ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ AZURE_DI_ENDPOINT=https://luisvalenciadi.cognitiveservices.azure.com/
2
+ AZURE_DI_KEY=REDACTED  # SECURITY: a live key was committed here in a public diff — rotate it immediately and load it from a secrets manager, never from a committed .env
3
+ OPENAI_API_KEY=REDACTED  # SECURITY: a live key was committed here in a public diff — rotate it immediately and load it from a secrets manager, never from a committed .env
4
+ AZURE_OPENAI_ENDPOINT=https://lvaaifoundry1567796533.openai.azure.com/
5
+ AZURE_OPENAI_DEPLOYMENT=gpt-4.1
6
+ AZURE_OPENAI_API_VERSION=2025-03-01-preview
7
+ AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small
.vscode/launch.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "configurations": [
3
+ {
4
+ "name": "Streamlit (debug)",
5
+ "type": "debugpy",
6
+ "request": "launch",
7
+ "module": "streamlit",
8
+ "args": ["run", "src/app.py"],
9
+ "envFile": "${workspaceFolder}/.env",
10
+ }
11
+ ]
12
+ }
13
+
Dockerfile CHANGED
@@ -18,4 +18,4 @@ EXPOSE 8501
18
 
19
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
20
 
21
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
18
 
19
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
20
 
21
+ ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
requirements.txt CHANGED
@@ -1,3 +1,9 @@
1
  altair
2
  pandas
3
- streamlit
 
 
 
 
 
 
 
1
  altair
2
  pandas
3
+ streamlit
4
+ pyyaml
5
+ python-dotenv
6
+ openai
7
+ pydantic-settings
8
+ PyMuPDF
9
+ azure-ai-documentintelligence
src/README.md ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deep-Research PDF Field Extractor
2
+
3
+ A powerful tool for extracting structured data from PDF documents, designed to handle various document types and extract specific fields of interest.
4
+
5
+ ## For End Users
6
+
7
+ ### Overview
8
+ The PDF Field Extractor helps you extract specific information from PDF documents. It can extract any fields you specify, such as dates, names, values, locations, and more. The tool is particularly useful for converting unstructured PDF data into structured, analyzable formats.
9
+
10
+ ### How to Use
11
+
12
+ 1. **Upload Your PDF**
13
+ - Click the "Upload PDF" button
14
+ - Select your PDF file from your computer
15
+
16
+ 2. **Specify Fields to Extract**
17
+ - Enter the fields you want to extract, separated by commas
18
+ - Example: `Date, Name, Value, Location, Page, FileName`
19
+
20
+ 3. **Optional: Add Field Descriptions**
21
+ - You can provide additional context about the fields.
22
+ - This helps the system better understand what to look for
23
+
24
+ 4. **Run Extraction**
25
+ - Click the "Run extraction" button
26
+ - Wait for the process to complete
27
+ - View your results in a table format
28
+
29
+ 5. **Download Results**
30
+ - Download your extracted data as a CSV file
31
+ - View execution traces and logs if needed
32
+
33
+ ### Features
34
+ - Automatic document type detection
35
+ - Smart field extraction
36
+ - Support for tables and text
37
+ - Detailed execution traces
38
+ - Downloadable results and logs
39
+
40
+ ## For Developers
41
+
42
+ ### Architecture Overview
43
+
44
+ The application is built using a multi-agent architecture with the following components:
45
+
46
+ #### Core Components
47
+
48
+ 1. **Planner (`orchestrator/planner.py`)**
49
+ - Generates execution plans using Azure OpenAI
50
+
51
+
52
+ 2. **Executor (`orchestrator/executor.py`)**
53
+ - Executes the generated plan
54
+ - Manages agent execution flow
55
+ - Handles context and result management
56
+
57
+ 3. **Agents**
58
+ - `PDFAgent`: Extracts text from PDFs
59
+ - `TableAgent`: Extracts tables from PDFs
60
+ - `FieldMapper`: Maps fields to values
61
+ - `ForEachField`: Control flow for field iteration
62
+
63
+ ### Agent Pipeline
64
+
65
+ 1. **Document Processing**
66
+ ```python
67
+ # Document is processed in stages:
68
+ 1. PDF text extraction
69
+ 2. Table extraction
70
+ 3. Field mapping
71
+ ```
72
+
73
+ 2. **Field Extraction Process**
74
+ - Document type inference
75
+ - User profile determination
76
+ - Page-by-page scanning
77
+ - Value extraction and validation
78
+
79
+ 3. **Context Building**
80
+ - Document metadata
81
+ - Field descriptions
82
+ - User context
83
+ - Execution history
84
+
85
+ ### Key Features
86
+
87
+ #### Document Type Inference
88
+ The system automatically infers document type and user profile:
89
+ ```python
90
+ # Example inference:
91
+ "Document type: Analytical report
92
+ User profile: Data analysts or researchers working with document analysis"
93
+ ```
94
+
95
+ #### Field Mapping
96
+ The FieldMapper agent uses a sophisticated approach:
97
+ 1. Document context analysis
98
+ 2. Page-by-page scanning
99
+ 3. Value extraction using LLM
100
+ 4. Result validation
101
+
102
+ #### Execution Traces
103
+ The system maintains detailed execution traces:
104
+ - Tool execution history
105
+ - Success/failure status
106
+ - Detailed logs
107
+ - Result storage
108
+
109
+ ### Technical Setup
110
+
111
+ 1. **Dependencies**
112
+ ```python
113
+ # Key dependencies:
114
+ - streamlit
115
+ - pandas
116
+ - PyMuPDF (fitz)
117
+ - Azure OpenAI
118
+ - Azure Document Intelligence
119
+ ```
120
+
121
+ 2. **Configuration**
122
+ - Environment variables for API keys
123
+ - Prompt templates in `config/prompts.yaml`
124
+ - Settings in `config/settings.py`
125
+
126
+ 3. **Logging System**
127
+ ```python
128
+ # Custom logging setup:
129
+ - LogCaptureHandler for UI display
130
+ - Structured logging format
131
+ - Execution history storage
132
+ ```
133
+
134
+ ### Development Guidelines
135
+
136
+ 1. **Adding New Agents**
137
+ - Inherit from base agent class
138
+ - Implement required methods
139
+ - Add to planner configuration
140
+
141
+ 2. **Modifying Extraction Logic**
142
+ - Update prompt templates
143
+ - Modify field mapping logic
144
+ - Adjust validation rules
145
+
146
+ 3. **Extending Functionality**
147
+ - Add new field types
148
+ - Implement custom validators
149
+ - Create new output formats
150
+
151
+ ### Testing
152
+ - Unit tests for agents
153
+ - Integration tests for pipeline
154
+ - End-to-end testing with sample PDFs
155
+
156
+ ### Deployment
157
+ - Streamlit app deployment
158
+ - Environment configuration
159
+ - API key management
160
+ - Logging setup
161
+
162
+ ### Future Improvements
163
+ - Enhanced error handling
164
+ - Additional field types
165
+ - Improved validation
166
+ - Performance optimization
167
+ - Extended documentation
src/agents/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/agents/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (173 Bytes). View file
 
src/agents/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (161 Bytes). View file
 
src/agents/__pycache__/base_agent.cpython-312.pyc ADDED
Binary file (757 Bytes). View file
 
src/agents/__pycache__/base_agent.cpython-313.pyc ADDED
Binary file (805 Bytes). View file
 
src/agents/__pycache__/cache_agent.cpython-313.pyc ADDED
Binary file (7.52 kB). View file
 
src/agents/__pycache__/confidence_scorer.cpython-312.pyc ADDED
Binary file (659 Bytes). View file
 
src/agents/__pycache__/confidence_scorer.cpython-313.pyc ADDED
Binary file (707 Bytes). View file
 
src/agents/__pycache__/document_intelligence_agent.cpython-313.pyc ADDED
Binary file (3.07 kB). View file
 
src/agents/__pycache__/field_mapper_agent.cpython-312.pyc ADDED
Binary file (18 kB). View file
 
src/agents/__pycache__/field_mapper_agent.cpython-313.pyc ADDED
Binary file (18.2 kB). View file
 
src/agents/__pycache__/index_agent.cpython-312.pyc ADDED
Binary file (7.63 kB). View file
 
src/agents/__pycache__/index_agent.cpython-313.pyc ADDED
Binary file (7.72 kB). View file
 
src/agents/__pycache__/pdf_agent.cpython-312.pyc ADDED
Binary file (1.71 kB). View file
 
src/agents/__pycache__/pdf_agent.cpython-313.pyc ADDED
Binary file (1.75 kB). View file
 
src/agents/__pycache__/query_generator.cpython-312.pyc ADDED
Binary file (762 Bytes). View file
 
src/agents/__pycache__/query_generator.cpython-313.pyc ADDED
Binary file (810 Bytes). View file
 
src/agents/__pycache__/semantic_reasoner.cpython-312.pyc ADDED
Binary file (902 Bytes). View file
 
src/agents/__pycache__/semantic_reasoner.cpython-313.pyc ADDED
Binary file (960 Bytes). View file
 
src/agents/__pycache__/table_agent.cpython-312.pyc ADDED
Binary file (2.74 kB). View file
 
src/agents/__pycache__/table_agent.cpython-313.pyc ADDED
Binary file (2.81 kB). View file
 
src/agents/base_agent.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """Abstract base class for every agent."""
2
+ from abc import ABC, abstractmethod
3
+ from typing import Dict, Any
4
+
5
+ class BaseAgent(ABC):
6
+ @abstractmethod
7
+ def execute(self, ctx: Dict[str, Any]):
8
+ """Mutate / consume ctx and return a value."""
src/agents/confidence_scorer.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from typing import Dict, Any
2
+ from .base_agent import BaseAgent
3
+
4
+ class ConfidenceScorer(BaseAgent):
5
+ def execute(self, ctx: Dict[str, Any]):
6
+ return 1.0 # always confident for stub
src/agents/field_mapper_agent.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Map a single field to a candidate value using page-by-page analysis and LLM-based extraction."""
2
+ from typing import Dict, Any, Optional, List
3
+ import logging
4
+ import re
5
+ import json
6
+ from .base_agent import BaseAgent
7
+ from services.llm_client import LLMClient
8
+ from services.embedding_client import EmbeddingClient
9
+ from config.settings import settings
10
+
11
+ # Configure logging to disable verbose Azure HTTP logs
12
+ logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
13
+ logging.getLogger('azure.core.pipeline').setLevel(logging.WARNING)
14
+ logging.getLogger('azure').setLevel(logging.WARNING)
15
+
16
+ class FieldMapperAgent(BaseAgent):
17
+ def __init__(self):
18
+ self.logger = logging.getLogger(__name__)
19
+ self.llm = LLMClient(settings)
20
+ self.embedding_client = EmbeddingClient()
21
+
22
+ def _infer_document_context(self, text: str) -> str:
23
+ """Use LLM to infer document context and user profile."""
24
+ prompt = f"""Given this document text, describe the document type and typical user profile in 1-2 sentences.
25
+ Focus on the domain, purpose, and who would use this document.
26
+
27
+ Document text:
28
+ {text[:2000]} # First 2000 chars for context
29
+
30
+ Response format:
31
+ Document type: [type]
32
+ User profile: [profile]
33
+ """
34
+
35
+ try:
36
+ self.logger.info("Inferring document context...")
37
+ self.logger.debug(f"Using text preview: {text[:500]}...")
38
+ context = self.llm.responses(prompt, temperature=0.0)
39
+ self.logger.info(f"Inferred context: {context}")
40
+ return context
41
+ except Exception as e:
42
+ self.logger.error(f"Error inferring context: {str(e)}")
43
+ return "Generic document user"
44
+
45
+ def _find_similar_chunks_search(self, query: str, index: Dict[str, Any], top_k: int = 3) -> List[Dict[str, Any]]:
46
+ """Find chunks semantically similar to the query using cosine similarity."""
47
+ try:
48
+ self.logger.info(f"Finding similar chunks for query: {query}")
49
+ self.logger.debug(f"Index contains {len(index['chunks'])} chunks and {len(index['embeddings'])} embeddings")
50
+
51
+ # Get query embedding
52
+ self.logger.debug("Generating embedding for query...")
53
+ query_embedding = self.embedding_client.embed([query])[0]
54
+ self.logger.debug(f"Query embedding generated, length: {len(query_embedding)}")
55
+
56
+ # Calculate similarities
57
+ similarities = []
58
+ for i, (chunk, embedding) in enumerate(zip(index["chunks"], index["embeddings"])):
59
+ similarity = self._cosine_similarity(query_embedding, embedding)
60
+ similarities.append((similarity, chunk))
61
+ self.logger.debug(f"Chunk {i} similarity: {similarity:.3f}")
62
+ self.logger.debug(f"Chunk {i} preview: {chunk['text'][:100]}...")
63
+
64
+ # Sort by similarity and return top k
65
+ similarities.sort(reverse=True)
66
+ results = [chunk for _, chunk in similarities[:top_k]]
67
+
68
+ # Log top results
69
+ self.logger.info(f"Found {len(results)} similar chunks")
70
+ for i, (sim, chunk) in enumerate(similarities[:top_k]):
71
+ self.logger.info(f"Top {i+1} match (similarity: {sim:.3f}): {chunk['text'][:200]}...")
72
+
73
+ return results
74
+
75
+ except Exception as e:
76
+ self.logger.error(f"Error finding similar chunks: {str(e)}", exc_info=True)
77
+ return []
78
+
79
+ def _cosine_similarity(self, a: List[float], b: List[float]) -> float:
80
+ """Calculate cosine similarity between two vectors."""
81
+ import numpy as np
82
+ try:
83
+ # Check for zero vectors
84
+ if not a or not b or all(x == 0 for x in a) or all(x == 0 for x in b):
85
+ self.logger.warning("Zero vector detected in cosine similarity calculation")
86
+ return 0.0
87
+
88
+ # Convert to numpy arrays
89
+ a_np = np.array(a)
90
+ b_np = np.array(b)
91
+
92
+ # Calculate norms
93
+ norm_a = np.linalg.norm(a_np)
94
+ norm_b = np.linalg.norm(b_np)
95
+
96
+ # Check for zero norms
97
+ if norm_a == 0 or norm_b == 0:
98
+ self.logger.warning("Zero norm detected in cosine similarity calculation")
99
+ return 0.0
100
+
101
+ # Calculate similarity
102
+ similarity = np.dot(a_np, b_np) / (norm_a * norm_b)
103
+
104
+ # Check for NaN
105
+ if np.isnan(similarity):
106
+ self.logger.warning("NaN detected in cosine similarity calculation")
107
+ return 0.0
108
+
109
+ return float(similarity)
110
+ except Exception as e:
111
+ self.logger.error(f"Error calculating cosine similarity: {str(e)}")
112
+ return 0.0
113
+
114
+ def _extract_field_value_search(self, field: str, chunks: List[Dict[str, Any]], context: str) -> Optional[str]:
115
+ """Use LLM to extract field value from relevant chunks."""
116
+ # Combine chunks into context
117
+ chunk_texts = [chunk["text"] for chunk in chunks]
118
+ combined_context = "\n".join(chunk_texts)
119
+
120
+ self.logger.info(f"Extracting value for field '{field}' from {len(chunks)} chunks")
121
+ self.logger.debug(f"Combined context preview: {combined_context[:500]}...")
122
+
123
+ # Get filename from context if available
124
+ filename = self.ctx.get("pdf_meta", {}).get("filename", "")
125
+ filename_context = f"\nDocument filename: {filename}" if filename else ""
126
+
127
+ prompt = f"""You are an expert in {context}
128
+
129
+ Your task is to extract the value for the field: {field}{filename_context}
130
+
131
+ Consider the following context from the document:
132
+ {combined_context}
133
+
134
+ Instructions:
135
+ 1. Look for the field value in the context
136
+ 2. If you find multiple potential values, choose the most relevant one
137
+ 3. If you're not sure, return None
138
+ 4. Return ONLY the value, no explanations
139
+
140
+ Field value:"""
141
+
142
+ try:
143
+ self.logger.info(f"Calling LLM to extract value for field '{field}'")
144
+ self.logger.debug(f"Using prompt: {prompt}")
145
+ value = self.llm.responses(prompt, temperature=0.0)
146
+ self.logger.debug(f"Raw LLM response: {value}")
147
+
148
+ if value and value.lower() not in ["none", "null", "n/a"]:
149
+ self.logger.info(f"Successfully extracted value: {value}")
150
+ return value.strip()
151
+ else:
152
+ self.logger.warning(f"LLM returned no valid value for field '{field}'")
153
+ return None
154
+ except Exception as e:
155
+ self.logger.error(f"Error extracting field value: {str(e)}", exc_info=True)
156
+ return None
157
+
158
+ def _extract_field_value_from_page(self, field: str, page_text: str, context: str) -> Optional[str]:
159
+ """Use LLM to extract field value from a single page."""
160
+ self.logger.info(f"Extracting value for field '{field}' from page")
161
+ self.logger.debug(f"Page text preview: {page_text[:500]}...")
162
+
163
+ # Get filename from context if available
164
+ filename = self.ctx.get("pdf_meta", {}).get("filename", "")
165
+ filename_context = f"\nDocument filename: {filename}" if filename else ""
166
+
167
+ prompt = f"""You are an expert in {context}
168
+
169
+ Your task is to extract the value for the field: {field}{filename_context}
170
+
171
+ Consider the following page from the document:
172
+ {page_text}
173
+
174
+ Instructions:
175
+ 1. Look for the field values in this page
176
+ 2. Return the data in a tabular format where each field is a column
177
+ 3. Each field should have an array of values
178
+ 4. The arrays must be aligned (same length) to represent rows
179
+ 5. Return ONLY the JSON value, no explanations
180
+ 6. Format the response as a valid JSON object with field names as keys
181
+ 7. Keep the structure flat - do not nest values under 'details' or other keys
182
+
183
+ Example response format:
184
+ {{
185
+ "field1": ["value1", "value2", "value3"],
186
+ "field2": ["value4", "value5", "value6"],
187
+ "field3": ["value7", "value8", "value9"]
188
+ }}
189
+
190
+ Field value:"""
191
+
192
+ try:
193
+ self.logger.info(f"Calling LLM to extract value for field '{field}' from page")
194
+ value = self.llm.responses(prompt, temperature=0.0)
195
+ self.logger.debug(f"Raw LLM response: {value}")
196
+
197
+ if value and value.lower() not in ["none", "null", "n/a"]:
198
+ # Try to parse as JSON to ensure it's valid
199
+ try:
200
+ json_value = json.loads(value)
201
+ self.logger.info(f"Successfully extracted value: {json.dumps(json_value, indent=2)}")
202
+ return json.dumps(json_value, indent=2)
203
+ except json.JSONDecodeError:
204
+ # If not valid JSON, wrap it in a JSON object
205
+ json_value = {field: value.strip()}
206
+ self.logger.info(f"Wrapped non-JSON value in JSON object: {json.dumps(json_value, indent=2)}")
207
+ return json.dumps(json_value, indent=2)
208
+ else:
209
+ self.logger.warning(f"LLM returned no valid value for field '{field}'")
210
+ return None
211
+ except Exception as e:
212
+ self.logger.error(f"Error extracting field value from page: {str(e)}", exc_info=True)
213
+ return None
214
+
215
+ def execute(self, ctx: Dict[str, Any]): # noqa: D401
216
+ field = ctx.get("current_field")
217
+ self.logger.info(f"Starting field mapping for: {field}")
218
+
219
+ # Store context for use in extraction methods
220
+ self.ctx = ctx
221
+
222
+ # Get text and index
223
+ text = ""
224
+ index = None
225
+ if "index" in ctx and isinstance(ctx["index"], dict):
226
+ index = ctx["index"]
227
+ text = index.get("text", "")
228
+ self.logger.info(f"Using text from index (length: {len(text)})")
229
+ self.logger.debug(f"Index contains {len(index.get('chunks', []))} chunks")
230
+ self.logger.debug(f"Index contains {len(index.get('embeddings', []))} embeddings")
231
+ elif "text" in ctx:
232
+ text = ctx["text"]
233
+ self.logger.info(f"Using text from direct context (length: {len(text)})")
234
+
235
+ if not field:
236
+ self.logger.warning("No field provided in context")
237
+ return None
238
+ if not text:
239
+ self.logger.warning("No text content found in context or index")
240
+ return None
241
+
242
+ # Infer document context if not already present
243
+ if "document_context" not in ctx:
244
+ ctx["document_context"] = self._infer_document_context(text)
245
+
246
+ self.logger.info(f"Processing field: {field}")
247
+ self.logger.info(f"Using document context: {ctx['document_context']}")
248
+
249
+ # Split text into pages using the same markers as IndexAgent
250
+ page_markers = [
251
+ '<!-- PageBreak -->',
252
+ 'Page',
253
+ '---',
254
+ '\n\n', # Double newline as fallback
255
+ ]
256
+
257
+ # First try to find which marker is used
258
+ used_marker = None
259
+ for marker in page_markers:
260
+ if marker in text:
261
+ used_marker = marker
262
+ self.logger.info(f"Found page marker: {marker}")
263
+ break
264
+
265
+ if not used_marker:
266
+ self.logger.warning("No page markers found, falling back to double newline")
267
+ used_marker = '\n\n'
268
+
269
+ # Split the text
270
+ if used_marker == '\n\n':
271
+ pages = [p.strip() for p in text.split(used_marker) if p.strip()]
272
+ else:
273
+ pages = []
274
+ current_page = []
275
+
276
+ for line in text.split('\n'):
277
+ if used_marker in line:
278
+ if current_page:
279
+ pages.append('\n'.join(current_page))
280
+ current_page = [line]
281
+ else:
282
+ current_page.append(line)
283
+
284
+ if current_page:
285
+ pages.append('\n'.join(current_page))
286
+
287
+ self.logger.info(f"Split document into {len(pages)} pages")
288
+
289
+ # Process each page
290
+ for i, page in enumerate(pages, 1):
291
+ self.logger.info(f"Processing page {i}/{len(pages)}")
292
+ value = self._extract_field_value_from_page(field, page, ctx["document_context"])
293
+ if value:
294
+ return value
295
+
296
+ # If no value found in any page, try the search-based approach as fallback
297
+ self.logger.warning("No value found in page-by-page analysis, falling back to search-based approach")
298
+
299
+ if index and "embeddings" in index:
300
+ self.logger.info("Using semantic search with embeddings")
301
+ search_query = f"{field} in {ctx['document_context']}"
302
+ similar_chunks = self._find_similar_chunks_search(search_query, index)
303
+
304
+ if similar_chunks:
305
+ self.logger.info(f"Found {len(similar_chunks)} relevant chunks, attempting value extraction")
306
+ value = self._extract_field_value_search(field, similar_chunks, ctx["document_context"])
307
+ if value:
308
+ return value
309
+
310
+ self.logger.warning(f"No candidate found for field: {field}")
311
+ return f"<no candidate for {field}>"
src/agents/index_agent.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Create a semantic index of document content using embeddings."""
2
+ from typing import Dict, Any, List, Tuple
3
+ import logging
4
+ import numpy as np
5
+ from .base_agent import BaseAgent
6
+ from services.embedding_client import EmbeddingClient
7
+
8
+ class IndexAgent(BaseAgent):
9
+ def __init__(self):
10
+ self.logger = logging.getLogger(__name__)
11
+ self.embedding_client = EmbeddingClient()
12
+ self.logger.info("IndexAgent initialized")
13
+
14
+ def execute(self, ctx: Dict[str, Any]):
15
+ """Create a semantic index of document content."""
16
+ try:
17
+ self.logger.info("Starting index creation")
18
+
19
+ # Get text from PDF agent
20
+ text = ctx.get("text", "")
21
+ if not text:
22
+ self.logger.warning("No text content found in context")
23
+ return {}
24
+ self.logger.info(f"Found text content of length {len(text)}")
25
+
26
+ # Get tables from Table agent
27
+ tables = ctx.get("tables", [])
28
+ self.logger.info(f"Found {len(tables)} tables in context")
29
+
30
+ # Combine all content
31
+ all_content = text
32
+ if tables:
33
+ all_content += "\n".join(tables)
34
+ self.logger.info(f"Combined content length: {len(all_content)}")
35
+
36
+ # Create chunks with metadata
37
+ chunks = self._create_chunks(all_content)
38
+ self.logger.info(f"Created {len(chunks)} content chunks")
39
+ for i, chunk in enumerate(chunks):
40
+ self.logger.debug(f"Chunk {i}: {chunk['text'][:100]}...")
41
+
42
+ # Get embeddings for chunks
43
+ chunk_texts = [chunk["text"] for chunk in chunks]
44
+ self.logger.info(f"Getting embeddings for {len(chunk_texts)} chunks")
45
+ embeddings = self.embedding_client.embed(chunk_texts)
46
+ self.logger.info(f"Generated {len(embeddings)} embeddings")
47
+
48
+ # Create semantic index
49
+ index = {
50
+ "chunks": chunks,
51
+ "embeddings": embeddings,
52
+ "text": all_content, # Keep full text for non-semantic search
53
+ }
54
+
55
+ # Store in context
56
+ ctx["index"] = index
57
+ self.logger.info(f"Created semantic index with {len(chunks)} chunks")
58
+ return index
59
+
60
+ except Exception as e:
61
+ self.logger.error(f"Error in IndexAgent: {str(e)}", exc_info=True)
62
+ return {}
63
+
64
+ def _create_chunks(self, text: str, chunk_size: int = 1000) -> List[Dict[str, Any]]:
65
+ """Split text into chunks with metadata."""
66
+ self.logger.info(f"Creating chunks from text of length {len(text)}")
67
+ chunks = []
68
+ sentences = text.split(". ")
69
+ self.logger.info(f"Split into {len(sentences)} sentences")
70
+ current_chunk = []
71
+ current_size = 0
72
+ total_length = 0
73
+
74
+ for sentence in sentences:
75
+ sentence = sentence.strip() + ". "
76
+ sentence_size = len(sentence)
77
+
78
+ if current_size + sentence_size > chunk_size and current_chunk:
79
+ # Save current chunk
80
+ chunk_text = "".join(current_chunk)
81
+ chunks.append({
82
+ "text": chunk_text,
83
+ "start": total_length,
84
+ "end": total_length + len(chunk_text),
85
+ "type": "text"
86
+ })
87
+ total_length += len(chunk_text)
88
+ self.logger.debug(f"Created chunk of size {len(chunk_text)}")
89
+ current_chunk = []
90
+ current_size = 0
91
+
92
+ current_chunk.append(sentence)
93
+ current_size += sentence_size
94
+
95
+ # Add last chunk if any
96
+ if current_chunk:
97
+ chunk_text = "".join(current_chunk)
98
+ chunks.append({
99
+ "text": chunk_text,
100
+ "start": total_length,
101
+ "end": total_length + len(chunk_text),
102
+ "type": "text"
103
+ })
104
+ self.logger.debug(f"Created final chunk of size {len(chunk_text)}")
105
+
106
+ self.logger.info(f"Created {len(chunks)} total chunks")
107
+ return chunks
108
+
109
+ def find_similar_chunks(self, query: str, index: Dict[str, Any], top_k: int = 3) -> List[Dict[str, Any]]:
110
+ """Find chunks semantically similar to the query."""
111
+ try:
112
+ self.logger.info(f"Finding similar chunks for query: {query}")
113
+ # Get query embedding
114
+ query_embedding = self.embedding_client.embed([query])[0]
115
+
116
+ # Calculate similarities
117
+ similarities = []
118
+ for chunk, embedding in zip(index["chunks"], index["embeddings"]):
119
+ similarity = self._cosine_similarity(query_embedding, embedding)
120
+ similarities.append((similarity, chunk))
121
+ self.logger.debug(f"Chunk similarity: {similarity:.3f}")
122
+
123
+ # Sort by similarity and return top k
124
+ similarities.sort(reverse=True)
125
+ results = [chunk for _, chunk in similarities[:top_k]]
126
+ self.logger.info(f"Found {len(results)} similar chunks")
127
+ return results
128
+
129
+ except Exception as e:
130
+ self.logger.error(f"Error finding similar chunks: {str(e)}", exc_info=True)
131
+ return []
132
+
133
+ def _cosine_similarity(self, a: List[float], b: List[float]) -> float:
134
+ """Calculate cosine similarity between two vectors."""
135
+ return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
src/agents/pdf_agent.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Extract raw text from the uploaded PDF using PyMuPDF.
2
+ This keeps the implementation minimal for a POC while remaining easy to extend.
3
+ """
4
+ from typing import Dict, Any, List
5
+
6
+ import fitz # PyMuPDF
7
+
8
+ from .base_agent import BaseAgent
9
+
10
+
11
+ class PDFAgent(BaseAgent):
12
+ """Reads the PDF, concatenates all page text and stores it under ``ctx['text']``."""
13
+
14
+ def _extract_text(self, pdf_bytes: bytes) -> str:
15
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf") # type: ignore[arg-type]
16
+ pages: List[str] = [page.get_text() for page in doc] # list-comp for clarity
17
+ return "\n".join(pages)
18
+
19
+ # -----------------------------------------------------
20
+ def execute(self, ctx: Dict[str, Any]): # noqa: D401
21
+ pdf_file = ctx.get("pdf_file")
22
+ if pdf_file is None:
23
+ raise ValueError("PDFAgent expected 'pdf_file' in context but none provided.")
24
+
25
+ pdf_bytes = pdf_file.read()
26
+ text = self._extract_text(pdf_bytes)
27
+ ctx["text"] = text
28
+ return text
src/agents/query_generator.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any
2
+ from .base_agent import BaseAgent
3
+
4
+ class QueryGenerator(BaseAgent):
5
+ def execute(self, ctx: Dict[str, Any]):
6
+ field = ctx.get("current_field")
7
+ return f"Follow-up query for {field}"
src/agents/semantic_reasoner.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any
2
+ from .base_agent import BaseAgent
3
+
4
+ class SemanticReasonerAgent(BaseAgent):
5
+ def execute(self, ctx: Dict[str, Any]):
6
+ field = ctx.get("current_field")
7
+ candidate = ctx.get("candidates", {}).get(field)
8
+ return candidate or f"<unresolved {field}>"
src/agents/table_agent.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Extract tables from PDF using Azure Document Intelligence."""
2
+ from typing import Dict, Any
3
+ import logging
4
+ from .base_agent import BaseAgent
5
+ from services.azure_di_service import AzureDIService
6
+
7
+ class TableAgent(BaseAgent):
8
+ def __init__(self, settings):
9
+ self.service = AzureDIService(settings.AZURE_DI_ENDPOINT, settings.AZURE_DI_KEY)
10
+ self.logger = logging.getLogger(__name__)
11
+
12
+ def execute(self, ctx: Dict[str, Any]):
13
+ """Extract tables from PDF."""
14
+ try:
15
+ pdf_file = ctx.get("pdf_file")
16
+ if not pdf_file:
17
+ self.logger.error("No PDF file found in context")
18
+ return {}
19
+
20
+ # Get the current position of the file pointer
21
+ current_pos = pdf_file.tell()
22
+ self.logger.info(f"Current file position: {current_pos}")
23
+
24
+ # Reset to beginning if not at start
25
+ if current_pos != 0:
26
+ self.logger.info("Resetting file pointer to beginning")
27
+ pdf_file.seek(0)
28
+
29
+ # Read the file
30
+ pdf_bytes = pdf_file.read()
31
+ self.logger.info(f"Read {len(pdf_bytes)} bytes from PDF")
32
+
33
+ # Extract content using Azure DI
34
+ result = self.service.extract_tables(pdf_bytes)
35
+
36
+ # Store both text and tables in context
37
+ ctx["text"] = result["text"]
38
+ ctx["tables"] = result["tables"]
39
+
40
+ self.logger.info(f"Extracted {len(result['text'])} characters of text and {len(result['tables'])} tables")
41
+ return result
42
+
43
+ except Exception as e:
44
+ self.logger.error(f"Error in TableAgent: {str(e)}")
45
+ self.logger.exception("Full traceback:")
46
+ return {}
src/app.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Streamlit front‑end entry‑point."""
2
+ import yaml
3
+ import json
4
+ import streamlit as st
5
+ import logging
6
+ from dotenv import load_dotenv
7
+ from orchestrator.planner import Planner
8
+ from orchestrator.executor import Executor
9
+ from config.settings import settings
10
+ import fitz # PyMuPDF local import to avoid heavy load on startup
11
+ import pandas as pd
12
+ from datetime import datetime
13
+ import io
14
+ import sys
15
+ from io import StringIO
16
+
17
+ # Create a custom stream handler to capture logs
18
+ class LogCaptureHandler(logging.StreamHandler):
19
+ def __init__(self):
20
+ super().__init__()
21
+ self.logs = []
22
+
23
+ def emit(self, record):
24
+ try:
25
+ msg = self.format(record)
26
+ self.logs.append(msg)
27
+ except Exception:
28
+ self.handleError(record)
29
+
30
+ def get_logs(self):
31
+ return "\n".join(self.logs)
32
+
33
+ def clear(self):
34
+ self.logs = []
35
+
36
+ # Initialize session state for storing execution history
37
+ if 'execution_history' not in st.session_state:
38
+ st.session_state.execution_history = []
39
+
40
+ # Set up logging capture
41
+ log_capture = LogCaptureHandler()
42
+ log_capture.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
43
+
44
+ # Configure root logger
45
+ root_logger = logging.getLogger()
46
+ root_logger.setLevel(logging.INFO)
47
+ root_logger.addHandler(log_capture)
48
+
49
+ # Configure specific loggers
50
+ for logger_name in ['orchestrator', 'agents', 'services']:
51
+ logger = logging.getLogger(logger_name)
52
+ logger.setLevel(logging.INFO)
53
+ logger.addHandler(log_capture)
54
+
55
+ load_dotenv()
56
+
57
+ st.set_page_config(page_title="PDF Field Extractor", layout="wide")
58
+
59
+ # Sidebar navigation
60
+ st.sidebar.title("Navigation")
61
+ page = st.sidebar.radio("Go to", ["Documentation", "Traces", "Execution"])
62
+
63
+ # Documentation Page
64
+ if page == "Documentation":
65
+ st.title("Deep‑Research PDF Field Extractor (POC)")
66
+
67
+ st.markdown("""
68
+ This system uses a multi-step pipeline to extract fields from PDFs:
69
+ 1. **Document Intelligence**: Extracts text and tables from PDFs using Azure Document Intelligence
70
+ 2. **Semantic Indexing**: Creates searchable chunks with embeddings for semantic search
71
+ 3. **Field Extraction**: Uses a two-step approach:
72
+ - First attempts page-by-page scanning for precise extraction
73
+ - Falls back to semantic search if no value is found
74
+ 4. **Validation**: Ensures extracted values are correct and properly formatted
75
+ 5. **Confidence Scoring**: Identifies which extractions need review
76
+ """)
77
+
78
+ st.markdown("""
79
+ ### Agent Descriptions
80
+ #### DocumentIntelligenceAgent
81
+ - Uses Azure Document Intelligence to extract text and tables
82
+ - Preserves document layout and structure
83
+ - Outputs both raw text and formatted tables as HTML
84
+ - Handles complex document layouts and tables
85
+
86
+ #### IndexAgent
87
+ - Creates semantic search index from extracted content
88
+ - Splits document into manageable chunks with metadata
89
+ - Generates embeddings for semantic search
90
+ - Provides both chunk-based and full-text search capabilities
91
+ - Includes chunk statistics and visualization
92
+
93
+ #### FieldMapper
94
+ - Implements a two-step field extraction strategy:
95
+ 1. Page-by-page scanning for precise extraction
96
+ 2. Semantic search fallback if no value found
97
+ - Uses document context to improve extraction accuracy
98
+ - Handles multiple potential values with confidence scoring
99
+ - Returns structured JSON responses with value details
100
+
101
+ #### SemanticReasoner
102
+ - Validates and cleans up candidate values
103
+ - Uses domain knowledge to ensure values make sense
104
+ - Can reformat values to standard format
105
+ - Returns `<unresolved>` if value is wrong/missing
106
+
107
+ #### ConfidenceScorer
108
+ - Assigns confidence score (0-1) to each extraction
109
+ - Helps identify which extractions need review
110
+ - Can trigger follow-up queries when confidence is low
111
+
112
+ #### QueryGenerator
113
+ - Generates follow-up questions when confidence is low
114
+ - Creates concise questions (≀12 words)
115
+ - Helps guide system to better extractions
116
+ """)
117
+
118
+ # Traces Page
119
+ elif page == "Traces":
120
+ st.title("Execution Traces")
121
+
122
+ if not st.session_state.execution_history:
123
+ st.info("No execution traces available yet. Run an extraction to see traces here.")
124
+ else:
125
+ # Create a DataFrame from the execution history
126
+ history_data = []
127
+ for record in st.session_state.execution_history:
128
+ history_data.append({
129
+ "filename": record["filename"],
130
+ "datetime": record["datetime"],
131
+ "fields": ", ".join(record.get("fields", [])),
132
+ "logs": record.get("logs", []),
133
+ "results": record.get("results", None)
134
+ })
135
+
136
+ history_df = pd.DataFrame(history_data)
137
+
138
+ # Display column headers
139
+ col1, col2, col3, col4, col5 = st.columns([2, 2, 3, 1, 1])
140
+ with col1:
141
+ st.markdown("**Filename**")
142
+ with col2:
143
+ st.markdown("**Timestamp**")
144
+ with col3:
145
+ st.markdown("**Fields**")
146
+ with col4:
147
+ st.markdown("**Logs**")
148
+ with col5:
149
+ st.markdown("**Results**")
150
+
151
+ st.markdown("---") # Add a separator line
152
+
153
+ # Display the table with download buttons
154
+ for idx, row in history_df.iterrows():
155
+ col1, col2, col3, col4, col5 = st.columns([2, 2, 3, 1, 1])
156
+ with col1:
157
+ st.write(row["filename"])
158
+ with col2:
159
+ st.write(row["datetime"])
160
+ with col3:
161
+ st.write(row["fields"])
162
+ with col4:
163
+ if row["logs"]: # Check if we have any logs
164
+ st.download_button(
165
+ "Download Logs",
166
+ row["logs"], # Use the stored logs
167
+ file_name=f"logs_{row['filename']}_{row['datetime']}.txt",
168
+ key=f"logs_dl_{idx}"
169
+ )
170
+ else:
171
+ st.write("No Logs")
172
+ with col5:
173
+ if row["results"] is not None:
174
+ results_df = pd.DataFrame(row["results"])
175
+ st.download_button(
176
+ "Download Results",
177
+ results_df.to_csv(index=False),
178
+ file_name=f"results_{row['filename']}_{row['datetime']}.csv",
179
+ key=f"results_dl_{idx}"
180
+ )
181
+ else:
182
+ st.write("No Results")
183
+ st.markdown("---") # Add a separator line between rows
184
+
185
+ # Execution Page
186
+ else: # page == "Execution"
187
+ st.title("Deep‑Research PDF Field Extractor (POC)")
188
+
189
+ pdf_file = st.file_uploader("Upload PDF", type=["pdf"])
190
+ fields_str = st.text_input("Fields (comma‑separated)", "Protein Lot, Chain, Residue")
191
+ desc_blob = st.text_area("Field descriptions / rules (YAML, optional)")
192
+
193
+ def flatten_json_response(json_data, fields):
194
+ """Flatten the nested JSON response into a tabular structure with dynamic columns."""
195
+ logger = logging.getLogger(__name__)
196
+ logger.info("Starting flatten_json_response")
197
+ logger.info(f"Input fields: {fields}")
198
+
199
+ # Handle the case where the response is a string
200
+ if isinstance(json_data, str):
201
+ logger.info("Input is a string, attempting to parse as JSON")
202
+ try:
203
+ json_data = json.loads(json_data)
204
+ logger.info("Successfully parsed JSON string")
205
+ except json.JSONDecodeError as e:
206
+ logger.error(f"Failed to parse JSON string: {e}")
207
+ return pd.DataFrame(columns=fields)
208
+
209
+ # If the data is wrapped in an array, get the first item
210
+ if isinstance(json_data, list) and len(json_data) > 0:
211
+ logger.info("Data is wrapped in an array, extracting first item")
212
+ json_data = json_data[0]
213
+
214
+ # If the data is a dictionary with numeric keys, get the first value
215
+ if isinstance(json_data, dict):
216
+ keys = list(json_data.keys())
217
+ logger.info(f"Checking dictionary keys: {keys}")
218
+ # Check if all keys are integers or string representations of integers
219
+ if all(isinstance(k, int) or (isinstance(k, str) and k.isdigit()) for k in keys):
220
+ logger.info("Data has numeric keys, extracting first value")
221
+ first_key = sorted(keys, key=lambda x: int(x) if isinstance(x, str) else x)[0]
222
+ json_data = json_data[first_key]
223
+ logger.info(f"Extracted data from key '{first_key}'")
224
+
225
+ logger.info(f"JSON data keys: {list(json_data.keys()) if isinstance(json_data, dict) else 'Not a dict'}")
226
+
227
+ # Create a list to store rows
228
+ rows = []
229
+
230
+ # Get the length of the first array to determine number of rows
231
+ if isinstance(json_data, dict) and len(json_data) > 0:
232
+ first_field = list(json_data.keys())[0]
233
+ num_rows = len(json_data[first_field]) if isinstance(json_data[first_field], list) else 1
234
+ logger.info(f"Number of rows to process: {num_rows}")
235
+
236
+ # Create a row for each index
237
+ for i in range(num_rows):
238
+ logger.debug(f"Processing row {i}")
239
+ row = {}
240
+ for field in fields:
241
+ if field in json_data and isinstance(json_data[field], list) and i < len(json_data[field]):
242
+ row[field] = json_data[field][i]
243
+ logger.debug(f"Field '{field}' value at index {i}: {json_data[field][i]}")
244
+ else:
245
+ row[field] = None
246
+ logger.debug(f"Field '{field}' not found or index {i} out of bounds")
247
+ rows.append(row)
248
+ else:
249
+ logger.error(f"Unexpected data structure: {type(json_data)}")
250
+ return pd.DataFrame(columns=fields)
251
+
252
+ # Create DataFrame with all requested fields as columns
253
+ df = pd.DataFrame(rows)
254
+ logger.info(f"Created DataFrame with shape: {df.shape}")
255
+ logger.info(f"DataFrame columns: {df.columns.tolist()}")
256
+
257
+ # Ensure columns are in the same order as the fields list
258
+ df = df[fields]
259
+ logger.info(f"Final DataFrame columns after reordering: {df.columns.tolist()}")
260
+
261
+ return df
262
+
263
+ if st.button("Run extraction") and pdf_file:
264
+ field_list = [f.strip() for f in fields_str.split(",") if f.strip()]
265
+ field_descs = yaml.safe_load(desc_blob) if desc_blob.strip() else {}
266
+
267
+ try:
268
+ with st.spinner("Planning …"):
269
+ # quick first-page text preview to give LLM document context
270
+ doc = fitz.open(stream=pdf_file.getvalue(), filetype="pdf") # type: ignore[arg-type]
271
+ preview = "\n".join(page.get_text() for page in doc[:10])[:20000] # first 2 pages, 2k chars
272
+
273
+ planner = Planner()
274
+ plan = planner.build_plan(
275
+ pdf_meta={"filename": pdf_file.name},
276
+ doc_preview=preview,
277
+ fields=field_list,
278
+ field_descs=field_descs,
279
+ )
280
+
281
+ # Add a visual separator
282
+ st.markdown("---")
283
+
284
+ with st.spinner("Executing …"):
285
+ executor = Executor(settings=settings)
286
+ results, logs = executor.run(plan, pdf_file)
287
+
288
+ # Add detailed logging about what executor returned
289
+ logger.info(f"Executor returned results of type: {type(results)}")
290
+ logger.info(f"Results content: {results}")
291
+
292
+ # Check if results is already a DataFrame
293
+ if isinstance(results, pd.DataFrame):
294
+ logger.info(f"Results is already a DataFrame with shape: {results.shape}")
295
+ logger.info(f"DataFrame columns: {results.columns.tolist()}")
296
+ logger.info(f"DataFrame head: {results.head()}")
297
+ df = results
298
+ else:
299
+ logger.info("Results is not a DataFrame, calling flatten_json_response")
300
+ # Process results using flatten_json_response
301
+ df = flatten_json_response(results, field_list)
302
+
303
+ # Log final DataFrame info
304
+ logger.info(f"Final DataFrame shape: {df.shape}")
305
+ logger.info(f"Final DataFrame columns: {df.columns.tolist()}")
306
+ if not df.empty:
307
+ logger.info(f"Final DataFrame sample: {df.head()}")
308
+
309
+ # Store execution in history
310
+ execution_record = {
311
+ "filename": pdf_file.name,
312
+ "datetime": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
313
+ "fields": field_list,
314
+ "logs": log_capture.get_logs(), # Store the actual logs
315
+ "results": df.to_dict() if not df.empty else None
316
+ }
317
+ st.session_state.execution_history.append(execution_record)
318
+ log_capture.clear() # Clear logs after storing them
319
+
320
+ # ----------------- UI: show execution tree -----------------
321
+ st.subheader("Execution trace")
322
+ for log in logs:
323
+ indent = "&nbsp;" * 4 * log["depth"]
324
+ # Add error indicator if there was an error
325
+ error_indicator = "❌ " if log.get("error") else "βœ“ "
326
+ # Use a fixed preview text instead of the result
327
+ with st.expander(f"{indent}{error_indicator}{log['tool']} – Click to view result"):
328
+ st.markdown(f"**Args**: `{log['args']}`", unsafe_allow_html=True)
329
+ if log.get("error"):
330
+ st.error(f"Error: {log['error']}")
331
+
332
+ # Special handling for IndexAgent output
333
+ if log['tool'] == "IndexAgent" and isinstance(log["result"], dict):
334
+ # Display chunk statistics if available
335
+ if "chunk_stats" in log["result"]:
336
+ st.markdown("### Chunk Statistics")
337
+ # Create a DataFrame for better visualization
338
+ stats_df = pd.DataFrame(log["result"]["chunk_stats"])
339
+ st.dataframe(stats_df)
340
+
341
+ # Add summary statistics
342
+ st.markdown("### Summary")
343
+ st.markdown(f"""
344
+ - Total chunks: {len(stats_df)}
345
+ - Average chunk length: {stats_df['length'].mean():.0f} characters
346
+ - Shortest chunk: {stats_df['length'].min()} characters
347
+ - Longest chunk: {stats_df['length'].max()} characters
348
+ """)
349
+
350
+ # Add a bar chart of chunk lengths
351
+ st.markdown("### Chunk Length Distribution")
352
+ st.bar_chart(stats_df.set_index('chunk_number')['length'])
353
+ else:
354
+ st.code(log["result"])
355
+
356
+ if not df.empty:
357
+ st.success("Done βœ“")
358
+ st.dataframe(df)
359
+ st.download_button("Download CSV", df.to_csv(index=False), "results.csv")
360
+ else:
361
+ st.warning("No results were extracted. Check the execution trace for errors.")
362
+ except Exception as e:
363
+ logging.exception("App error:")
364
+ st.error(f"An error occurred: {e}")
src/config/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/config/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (173 Bytes). View file
 
src/config/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (161 Bytes). View file
 
src/config/__pycache__/configurations.cpython-313.pyc ADDED
Binary file (7.13 kB). View file
 
src/config/__pycache__/settings.cpython-312.pyc ADDED
Binary file (1.27 kB). View file
 
src/config/__pycache__/settings.cpython-313.pyc ADDED
Binary file (1.27 kB). View file
 
src/config/field_rules.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Chain:
2
+ description: Heavy vs Light chain based on Seq Loc prefix.
3
+ rules:
4
+ starts_with: {L: "Light", H: "Heavy"}
src/config/prompts.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ planner: |
2
+ You are "Doc-to-Record Planner v1" – an expert at designing multi-step
3
+ extraction pipelines that convert an arbitrary document into a flat record
4
+ of user-requested fields.
5
+
6
+ You will be given:
7
+ – doc_preview: a few kB of raw text from the uploaded document (may include table HTML).
8
+ – fields: the list of field names the user wants extracted.
9
+ – pdf_meta / field_descriptions for extra context.
10
+
11
+ Available tools (use exactly these names in the JSON):
12
+ PDFAgent β†’ extracts raw text from the full PDF.
13
+ TableAgent β†’ calls Azure Document Intelligence to get HTML tables.
14
+ FieldMapper β†’ maps one field name to a candidate value.
15
+
16
+ Control-flow helper:
17
+ ForEachField – loops over every requested field and executes the nested "loop" array.
18
+
19
+ Output JSON **only** with this schema (no markdown):
20
+ {
21
+ "fields": [<same list you received>],
22
+ "steps": [
23
+ {"tool": "PDFAgent", "args": {}},
24
+ {"tool": "TableAgent", "args": {}},
25
+ {"tool": "ForEachField",
26
+ "loop": [
27
+ {"tool": "FieldMapper", "args": {"field": "$field"}}
28
+ ]}
29
+ ]
30
+ }
31
+
32
+ Always include PDFAgent and TableAgent first, then the ForEachField loop. Keep plans short and deterministic.
33
+
34
+ field_mapper: |
35
+ You are "FieldMapper v1" – a precision extractor.
36
+ Given:
37
+ β€’ field (target field name)
38
+ β€’ context (snippet of raw PDF text or table row)
39
+ Return **only** the best candidate value (no extra words).
40
+
41
+ semantic_reasoner: |
42
+ You are "Semantic Reasoner v1".
43
+ Validate the candidate value for a field using domain knowledge and the surrounding context.
44
+ If the candidate is obviously wrong / absent output <unresolved FIELDNAME> (same token as placeholder).
45
+ Otherwise output a cleaned, final value – no explanation text.
46
+
47
+ confidence_scorer: |
48
+ You are "Confidence Scorer". For the given field and candidate value assign a confidence between 0 and 1.
49
+ Output **only** the float.
50
+
51
+ query_generator: |
52
+ You are "Follow-up Query Generator". The previous candidate for a field was low-confidence.
53
+ Formulate a concise follow-up question (<=12 words) that, when answered, would help identify the field value.
src/config/settings.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import Field
2
+ from pydantic_settings import BaseSettings, SettingsConfigDict
3
+
4
+ class Settings(BaseSettings):
5
+ OPENAI_API_KEY: str = Field("", env="OPENAI_API_KEY")
6
+ AZURE_DI_ENDPOINT: str = Field("", env="AZURE_DI_ENDPOINT")
7
+ AZURE_DI_KEY: str = Field("", env="AZURE_DI_KEY")
8
+
9
+ # Azure OpenAI
10
+ AZURE_OPENAI_ENDPOINT: str = Field("", env="AZURE_OPENAI_ENDPOINT")
11
+ AZURE_OPENAI_DEPLOYMENT: str = Field("", env="AZURE_OPENAI_DEPLOYMENT")
12
+ AZURE_OPENAI_API_VERSION: str = Field("2024-02-15-preview", env="AZURE_OPENAI_API_VERSION")
13
+ AZURE_OPENAI_API_KEY: str = Field("", env="AZURE_OPENAI_API_KEY")
14
+ AZURE_OPENAI_EMBEDDING_MODEL: str = Field("text-embedding-3-small", env="AZURE_OPENAI_EMBEDDING_MODEL")
15
+
16
+ model_config: SettingsConfigDict = {"env_file": ".env"}
17
+
18
+ settings = Settings()
src/docker/Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ---------- builder -------------
2
+ FROM python:3.11-slim AS builder
3
+ RUN pip install --no-cache-dir uv
4
+
5
+ WORKDIR /app
6
+ COPY pyproject.toml ./
7
+ RUN uv pip install -r <(uv pip compile --quiet) \
8
+ && uv pip freeze > /installed.txt # layer cache
9
+
10
+ COPY . .
11
+
12
+ # ---------- runtime -------------
13
+ FROM python:3.11-slim
14
+ ENV PYTHONUNBUFFERED=1
15
+ WORKDIR /app
16
+ COPY --from=builder /installed.txt /installed.txt
17
+ RUN xargs -a /installed.txt pip install --no-cache-dir
18
+
19
+ COPY . .
20
+
21
+ ENTRYPOINT ["bash", "docker/entrypoint.sh"]
src/docker/entrypoint.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ source .env # load secrets (mounted at run)
3
+ streamlit run app.py --server.port ${PORT:-8501}
src/main.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ def main():
2
+ print("Hello from filetorecord!")
3
+
4
+
5
+ if __name__ == "__main__":
6
+ main()
src/orchestrator/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/orchestrator/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (179 Bytes). View file