adil9858 committed on
Commit
0b887a8
·
verified ·
1 Parent(s): 46d9119

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -0
app.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from openai import OpenAI
3
+ import base64
4
+ from PIL import Image
5
+ import io
6
+ import fitz # PyMuPDF
7
+ import tempfile
8
+ import os
9
+
10
+ # --- HELPER FUNCTIONS ---
11
def convert_pdf_to_images(pdf_file):
    """Render every page of a PDF into a PIL image.

    Args:
        pdf_file: Raw bytes of the PDF document.

    Returns:
        A list of RGB ``PIL.Image.Image`` objects, one per page, in page
        order.

    Raises:
        gr.Error: If the data is empty, or the PDF cannot be opened or
            rendered.
    """
    if not pdf_file:
        raise gr.Error("Error converting PDF: the uploaded file is empty")

    images = []
    try:
        # Open directly from memory. No temporary file is written, so there
        # is nothing left on disk to clean up when rendering fails midway.
        pdf_document = fitz.open(stream=pdf_file, filetype="pdf")
        try:
            for page in pdf_document:
                # Request a pixmap without alpha so the sample buffer is
                # 3 bytes per pixel, matching the "RGB" mode used below.
                pix = page.get_pixmap(alpha=False)
                images.append(
                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                )
        finally:
            # Always release the document, including on a failed render.
            pdf_document.close()
    except Exception as e:
        raise gr.Error(f"Error converting PDF: {e}") from e
    return images
37
+
38
def image_to_base64(image):
    """Encode an image as PNG and return the bytes as a base64 string.

    Args:
        image: Object with a PIL-style ``save(fp, format=...)`` method.

    Returns:
        The PNG data, base64-encoded and decoded to ``str`` as UTF-8.
    """
    png_buffer = io.BytesIO()
    try:
        image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    finally:
        png_buffer.close()

    encoded = base64.b64encode(png_bytes)
    return encoded.decode("utf-8")
43
+
44
def generate_summary(extracted_texts, api_key):
    """Ask the model for one combined summary of the per-page extractions.

    Args:
        extracted_texts: Text extracted from all pages, already joined into
            a single string.
        api_key: OpenRouter API key used to authenticate the request.

    Returns:
        The summary text as a ``str``; an empty string if the model
        returned no content.

    Raises:
        gr.Error: If the request fails or the response carries no choices.
    """
    # Built from explicit pieces so the prompt text does not depend on how
    # this function happens to be indented.
    summary_prompt = (
        "\n"
        "You are an expert document analyst. Below are the extracted contents from multiple pages of a document.\n"
        "Please provide a comprehensive, detailed summary that:\n"
        "1. Organizes all key information logically\n"
        "2. Identifies relationships between data points\n"
        "3. Highlights important figures, dates, names\n"
        "4. Presents the information in a clear, structured format\n"
        "\n"
        "Extracted contents from pages:\n"
        f"{extracted_texts}\n"
        "\n"
        "Comprehensive Summary:\n"
    )

    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        response = client.chat.completions.create(
            model="opengvlab/internvl3-14b:free",
            messages=[
                {"role": "system", "content": "You are Dalton, an expert in analyzing and summarizing document contents."},
                {"role": "user", "content": summary_prompt},
            ],
            max_tokens=2048,
        )
    except Exception as e:
        raise gr.Error(f"Error generating summary: {e}") from e

    # Report a response without choices explicitly instead of letting it
    # surface as an opaque subscript/index error.
    choices = getattr(response, "choices", None)
    if not choices:
        raise gr.Error("Error generating summary: the model returned no choices")

    # Content may be None; normalise so callers always receive a string.
    return choices[0].message.content or ""
79
+
80
def analyze_document(api_key, user_prompt, uploaded_file):
    """Run the vision model over an uploaded PDF or image and collect results.

    A PDF is rendered to one image per page and each page is analysed
    separately; any other file is opened as a single image. When more than
    one page was analysed, a combined summary is appended.

    Args:
        api_key: OpenRouter API key.
        user_prompt: Instruction sent to the model together with each page
            image. A blank prompt falls back to
            "Extract all content structurally".
        uploaded_file: The uploaded file, either as a path string or as an
            object exposing the path through a ``name`` attribute.

    Returns:
        A string with one result section per page, followed by the summary
        section when there are multiple pages.

    Raises:
        gr.Error: If the key or file is missing, the file cannot be read,
            the document has no pages, or a model call fails.
    """
    if not api_key:
        raise gr.Error("Please enter your OpenRouter API key")

    if uploaded_file is None:
        raise gr.Error("Please upload a document")

    # Accept both a plain path and a file-like wrapper carrying ``.name``.
    file_path = getattr(uploaded_file, "name", uploaded_file)
    file_ext = os.path.splitext(file_path)[1].lower()

    # Handle PDF or image
    if file_ext == ".pdf":
        try:
            with open(file_path, "rb") as f:
                pdf_data = f.read()
        except OSError as e:
            raise gr.Error(f"Error reading uploaded file: {e}") from e
        images_to_analyze = convert_pdf_to_images(pdf_data)  # all pages
    else:
        try:
            # copy() detaches the pixel data from the file, so the handle
            # can be closed immediately by the context manager.
            with Image.open(file_path) as opened:
                images_to_analyze = [opened.copy()]
        except Exception as e:
            raise gr.Error(f"Error opening image: {e}") from e

    if not images_to_analyze:
        raise gr.Error("The uploaded document contains no pages to analyze")

    prompt_text = (user_prompt or "").strip() or "Extract all content structurally"

    # One client serves every page; it does not depend on the loop variable.
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
    except Exception as e:
        raise gr.Error(f"Error creating API client: {e}") from e

    # Process each image
    all_results = []
    extracted_texts = []

    for idx, image in enumerate(images_to_analyze, 1):
        try:
            image_base64 = image_to_base64(image)
            response = client.chat.completions.create(
                model="opengvlab/internvl3-14b:free",
                messages=[
                    {"role": "system", "content": "You are Dalton, an expert in understanding images that can analyze images and provide detailed descriptions."},
                    {"role": "user", "content": [
                        {"type": "text", "text": prompt_text},
                        {"type": "image_url", "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        }},
                    ]},
                ],
                max_tokens=1024,
            )
        except Exception as e:
            raise gr.Error(f"Error analyzing page {idx}: {e}") from e

        # Report a response without choices explicitly instead of letting
        # it surface as an opaque subscript/index error.
        choices = getattr(response, "choices", None)
        if not choices:
            raise gr.Error(f"Error analyzing page {idx}: the model returned no choices")

        # Content may be None; keep the page section but leave it empty.
        result = choices[0].message.content or ""
        extracted_texts.append(f"=== Page {idx} ===\n{result}\n")
        all_results.append(f"📄 Page {idx} Result:\n{result}\n---\n")

    final_output = "\n".join(all_results)

    # Generate summary if multiple pages
    if len(extracted_texts) > 1:
        summary = generate_summary("\n".join(extracted_texts), api_key)
        final_output += f"\n📝 Comprehensive Summary:\n{summary}"

    return final_output
143
+
144
+ # --- GRADIO INTERFACE ---
145
# Build the UI: credentials and prompt side by side, then the upload widget,
# the trigger button and a read-only results box.
with gr.Blocks(title="DocSum - Document Summarizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧾 DocSum")
    gr.Markdown("Document Summarizer Powered by VLM • Developed by [Koshur AI](https://koshurai.com)")

    with gr.Row():
        api_key = gr.Textbox(
            label="🔑 OpenRouter API Key",
            type="password",  # mask the key while typing
            placeholder="Enter your OpenRouter API key",
        )
        user_prompt = gr.Textbox(
            label="📝 Enter Your Prompt",
            value="Extract all content structurally",
            placeholder="What would you like to extract?",
        )

    uploaded_file = gr.File(
        label="Upload Document (PDF/Image)",
        file_types=[".pdf", ".jpg", ".jpeg", ".png"],
    )

    submit_btn = gr.Button("🔍 Analyze Document", variant="primary")

    output = gr.Textbox(
        label="Analysis Results",
        interactive=False,
        lines=20,
        max_lines=50,
    )

    # Wire the button to the processing function; the inputs are passed
    # positionally in the order listed here.
    submit_btn.click(
        fn=analyze_document,
        inputs=[api_key, user_prompt, uploaded_file],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()