Spaces: Sleeping
v2
Browse files
- app.py: +321 -34
- requirements.txt: +40 -5
app.py
CHANGED
@@ -1,42 +1,329 @@
-import os
-print("
-#
-summarizer = pipeline(
-    task="summarization",
-    model=model,
-    tokenizer=tokenizer,
-    device=-1  # change to device=0 to use a GPU
-)
-        do_sample=False  # deterministic summarization via beam search
-    )
-    return result[0]["summary_text"]

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import PyPDF2
import requests
from bs4 import BeautifulSoup
import re
import warnings
warnings.filterwarnings("ignore")

class TextSummarizer:
    def __init__(self):
        # Check whether a GPU is available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Initialize a Japanese-capable summarization model
        # (using a lightweight, high-performance model)
        model_name = "facebook/bart-large-cnn"  # English-only
        # For Japanese, consider models such as "rinna/japanese-gpt2-medium" or "cyberagent/open-calm-7b"

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
            self.summarizer = pipeline(
                "summarization",
                model=self.model,
                tokenizer=self.tokenizer,
                device=0 if self.device == "cuda" else -1
            )
            print("Model loading complete")
        except Exception as e:
            print(f"Model loading error: {e}")
            # Lightweight fallback model
            self.summarizer = pipeline("summarization", device=0 if self.device == "cuda" else -1)

    def clean_text(self, text):
        """Pre-process the text"""
        # Tidy up stray characters and line breaks
        text = re.sub(r'\n+', '\n', text)
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        return text

    def chunk_text(self, text, max_length=1000):
        """Split long text into chunks"""
        sentences = text.split('.')
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            if len(current_chunk + sentence) < max_length:
                current_chunk += sentence + "."
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + "."

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def summarize_text(self, text, max_length=150, min_length=50):
        """Summarize the text"""
        try:
            cleaned_text = self.clean_text(text)

            if len(cleaned_text) < 100:
                return "The text is too short to summarize."

            # If the text is long, split it into chunks
            if len(cleaned_text) > 1000:
                chunks = self.chunk_text(cleaned_text)
                summaries = []

                for chunk in chunks:
                    try:
                        result = self.summarizer(
                            chunk,
                            max_length=max_length,
                            min_length=min_length,
                            do_sample=False
                        )
                        summaries.append(result[0]['summary_text'])
                    except Exception as e:
                        print(f"Chunk summarization error: {e}")
                        continue

                # Merge the chunk summaries
                combined_summary = " ".join(summaries)
                if len(combined_summary) > max_length * 2:
                    # Summarize again
                    final_result = self.summarizer(
                        combined_summary,
                        max_length=max_length,
                        min_length=min_length,
                        do_sample=False
                    )
                    return final_result[0]['summary_text']
                else:
                    return combined_summary
            else:
                result = self.summarizer(
                    cleaned_text,
                    max_length=max_length,
                    min_length=min_length,
                    do_sample=False
                )
                return result[0]['summary_text']

        except Exception as e:
            return f"An error occurred during summarization: {str(e)}"

    def structure_summary(self, summary_text):
        """Structure the summary"""
        # Simple structuring logic (a real project would need more sophisticated processing)
        sentences = summary_text.split('.')

        structured_output = "## 📋 Summary\n\n"

        if len(sentences) >= 3:
            structured_output += "### 🎯 Key points\n"
            structured_output += f"- {sentences[0].strip()}\n\n"

            structured_output += "### 📊 Details\n"
            for i, sentence in enumerate(sentences[1:-1], 1):
                if sentence.strip():
                    structured_output += f"{i}. {sentence.strip()}\n"

            if sentences[-1].strip():
                structured_output += "\n### 💡 Conclusion\n"
                structured_output += f"- {sentences[-1].strip()}\n"
        else:
            structured_output += f"### 📄 Summary\n{summary_text}\n"

        return structured_output

    def extract_text_from_pdf(self, pdf_file):
        """Extract text from a PDF"""
        try:
            reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"
            return text
        except Exception as e:
            return f"Error reading the PDF: {str(e)}"

    def extract_text_from_url(self, url):
        """Extract text from a website"""
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.encoding = response.apparent_encoding

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove unwanted tags
            for tag in soup(['script', 'style', 'nav', 'header', 'footer']):
                tag.decompose()

            # Extract the text
            text = soup.get_text()
            return self.clean_text(text)

        except Exception as e:
            return f"Error reading the website: {str(e)}"

# Global instance
summarizer = TextSummarizer()

def process_text_input(text, max_length, min_length):
    """Handle text input"""
    if not text.strip():
        return "Please enter some text."

    summary = summarizer.summarize_text(text, max_length, min_length)
    return summarizer.structure_summary(summary)

def process_pdf_input(pdf_file, max_length, min_length):
    """Handle PDF input"""
    if pdf_file is None:
        return "Please select a PDF file."

    text = summarizer.extract_text_from_pdf(pdf_file)
    if text.startswith("Error reading the PDF"):
        return text

    summary = summarizer.summarize_text(text, max_length, min_length)
    return summarizer.structure_summary(summary)

def process_url_input(url, max_length, min_length):
    """Handle URL input"""
    if not url.strip():
        return "Please enter a URL."

    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    text = summarizer.extract_text_from_url(url)
    if text.startswith("Error reading the website"):
        return text

    summary = summarizer.summarize_text(text, max_length, min_length)
    return summarizer.structure_summary(summary)

# Build the Gradio interface
def create_interface():
    with gr.Blocks(title="🤖 Local LLM Text Summarizer", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # 🤖 Local LLM Text Summarizer

        This tool uses a locally running LLM to summarize text and return the result in a structured format.

        ## 📝 Supported input formats
        - **Direct text input**
        - **PDF files**
        - **Website URLs**
        """)

        # Summarization settings
        with gr.Row():
            max_length = gr.Slider(
                minimum=50, maximum=500, value=150, step=10,
                label="Maximum summary length", info="Maximum length of the summary"
            )
            min_length = gr.Slider(
                minimum=20, maximum=200, value=50, step=10,
                label="Minimum summary length", info="Minimum length of the summary"
            )

        # Tabbed interface
        with gr.Tabs():
            # Text input tab
            with gr.TabItem("📝 Text input"):
                with gr.Row():
                    with gr.Column():
                        text_input = gr.Textbox(
                            lines=10,
                            placeholder="Enter the text you want to summarize...",
                            label="Input text"
                        )
                        text_btn = gr.Button("🔍 Summarize", variant="primary")

                    with gr.Column():
                        text_output = gr.Markdown(label="Summary")

                text_btn.click(
                    process_text_input,
                    inputs=[text_input, max_length, min_length],
                    outputs=text_output
                )

            # PDF input tab
            with gr.TabItem("📄 PDF input"):
                with gr.Row():
                    with gr.Column():
                        pdf_input = gr.File(
                            file_types=[".pdf"],
                            label="Select a PDF file"
                        )
                        pdf_btn = gr.Button("🔍 Summarize PDF", variant="primary")

                    with gr.Column():
                        pdf_output = gr.Markdown(label="Summary")

                pdf_btn.click(
                    process_pdf_input,
                    inputs=[pdf_input, max_length, min_length],
                    outputs=pdf_output
                )

            # URL input tab
            with gr.TabItem("🌐 Website URL"):
                with gr.Row():
                    with gr.Column():
                        url_input = gr.Textbox(
                            placeholder="https://example.com",
                            label="Website URL"
                        )
                        url_btn = gr.Button("🔍 Summarize website", variant="primary")

                    with gr.Column():
                        url_output = gr.Markdown(label="Summary")

                url_btn.click(
                    process_url_input,
                    inputs=[url_input, max_length, min_length],
                    outputs=url_output
                )

        # Usage notes
        gr.Markdown("""
        ## 🔧 How to use

        1. **Summary settings**: adjust the maximum and minimum summary length
        2. **Choose an input method**: direct text, PDF upload, or website URL
        3. **Run**: click the corresponding button
        4. **Review**: read the structured summary

        ## ⚙️ Technical details
        - **Model**: Facebook BART (run locally)
        - **GPU acceleration**: CUDA supported
        - **Output format**: structured Markdown
        """)

    return app

if __name__ == "__main__":
    # Reminder about the required libraries
    print("""
    Please install the required libraries:

    pip install torch transformers gradio PyPDF2 requests beautifulsoup4

    For GPU use, install the matching PyTorch build.
    """)

    # Launch the application
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",  # allow external access
        server_port=7860,
        share=True,  # generate a public URL
        debug=True
    )
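The comment in TextSummarizer.__init__ notes that facebook/bart-large-cnn is English-only and points at rinna/japanese-gpt2-medium or cyberagent/open-calm-7b for Japanese; those two are causal language models, so they would not load through AutoModelForSeq2SeqLM as the class is written. A multilingual seq2seq summarization checkpoint is a closer drop-in. Below is a minimal sketch of that swap; the checkpoint name is an assumption used for illustration, not something this commit ships.

# Sketch only: a Japanese-capable pipeline built the same way as TextSummarizer.__init__.
# The checkpoint name is an assumed example (a multilingual mT5 summarization model),
# not part of this commit.
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_name = "csebuetnlp/mT5_multilingual_XLSum"  # assumed example checkpoint
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
ja_summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=0 if device == "cuda" else -1,
)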
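Because the Gradio click handlers are plain functions, the summarization path can be smoke-tested without launching the UI. A minimal sketch; the sample paragraph is invented and kept above the 100-character minimum that summarize_text enforces.

# Sketch only: calling the handler directly, bypassing Gradio. The sample text is invented.
sample = (
    "Large language models are trained on large text corpora and can be adapted to "
    "tasks such as summarization, translation and question answering. Running them "
    "locally keeps documents on the user's own machine instead of a hosted service."
)
markdown_result = process_text_input(sample, 150, 50)  # max_length=150, min_length=50
print(markdown_result)  # the same structured Markdown the UI renders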
requirements.txt
CHANGED
@@ -1,5 +1,40 @@
# Core ML Libraries
torch>=2.0.0
transformers>=4.30.0
tokenizers>=0.13.0

# Web Interface
gradio>=3.35.0

# PDF Processing
PyPDF2>=3.0.0

# Web Scraping
requests>=2.31.0
beautifulsoup4>=4.12.0

# Data Processing
numpy>=1.24.0
pandas>=2.0.0

# Text Processing
nltk>=3.8.0
regex>=2023.6.3

# Optional: Japanese Text Processing
# fugashi>=1.3.0
# unidic-lite>=1.0.8
# mecab-python3>=1.0.6

# Optional: GPU Support (uncomment if using CUDA)
# torchaudio>=2.0.0+cu118 --extra-index-url https://download.pytorch.org/whl/cu118
# torchvision>=0.15.0+cu118 --extra-index-url https://download.pytorch.org/whl/cu118

# Development Tools (optional)
# jupyter>=1.0.0
# matplotlib>=3.7.0
# seaborn>=0.12.0

# Security
certifi>=2023.5.7
urllib3>=2.0.3