Futyn-Maker committed
Commit 2602ab3 Β· 1 Parent(s): ba1b3f0

Add the app

Files changed (3)
  1. README.md +1 -1
  2. app.py +152 -0
  3. requirements.txt +111 -0
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: Ocr Notebooks App
- emoji: πŸ”₯
+ emoji: πŸ“„
  colorFrom: gray
  colorTo: indigo
  sdk: gradio
app.py ADDED
@@ -0,0 +1,152 @@
+ import os
+ import numpy as np
+ import cv2
+ import torch
+ import gradio as gr
+ from PIL import Image
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+ from kraken import blla, binarization
+
+ # Initialize the model and processor
+ print("Π—Π°Π³Ρ€ΡƒΠ·ΠΊΠ° ΠΌΠΎΠ΄Π΅Π»ΠΈ OCR...")
+ model_name = "Futyn-Maker/trocr-base-ru-notebooks"
+ processor = TrOCRProcessor.from_pretrained(model_name)
+ model = VisionEncoderDecoderModel.from_pretrained(model_name)
+
+ # Check whether a GPU is available
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+ print(f"ИспользованиС устройства: {device}")
+
+ def segment_image(image):
+     """
+     Segment the image into text lines with Kraken.
+     """
+     # Convert to a binary image
+     bw_img = binarization.nlbin(image, threshold=0.5, escale=2.0, border=0.1, high=0.9)
+
+     # Segment into lines
+     lines = blla.segment(bw_img, text_direction='horizontal-lr')
+
+     # Sort the lines and merge ones that are close together
+     sorted_lines = sorted(lines.lines, key=lambda line: line.baseline[0][1])  # Sort by y-coordinate
+     merged_lines = []
+
+     if sorted_lines:
+         current_line = sorted_lines[0]
+         for next_line in sorted_lines[1:]:
+             current_y = current_line.baseline[0][1]
+             next_y = next_line.baseline[0][1]
+
+             if abs(next_y - current_y) < 15:
+                 current_line.baseline.extend(next_line.baseline)
+             else:
+                 merged_lines.append(current_line)
+                 current_line = next_line
+         merged_lines.append(current_line)
+     else:
+         merged_lines = sorted_lines
+
+     # Extract the line regions
+     line_images = []
+     for line in merged_lines:
+         baseline = np.array(line.baseline)
+         x0 = int(np.min(baseline[:, 0]))  # Minimum x-coordinate
+         y0 = int(np.min(baseline[:, 1]))  # Minimum y-coordinate
+         x1 = int(np.max(baseline[:, 0]))  # Maximum x-coordinate
+         y1 = int(np.max(baseline[:, 1]))  # Maximum y-coordinate
+
+         # Add padding for better recognition
+         padding = 30
+         y0 = max(0, y0 - padding)
+         y1 = min(image.height, y1 + padding)
+
+         # Crop out the line region
+         line_image = image.crop((x0, y0, x1, y1))
+         line_images.append(line_image)
+
+     return line_images
+
+ def recognize_text(image):
+     """
+     Recognize the text in an image by segmenting it into lines.
+     """
+     # Segment the image into lines
+     line_images = segment_image(image)
+
+     if not line_images:
+         return "НС ΡƒΠ΄Π°Π»ΠΎΡΡŒ ΠΎΠ±Π½Π°Ρ€ΡƒΠΆΠΈΡ‚ΡŒ строки тСкста Π½Π° ΠΈΠ·ΠΎΠ±Ρ€Π°ΠΆΠ΅Π½ΠΈΠΈ."
+
+     # Recognize the text of each line
+     recognized_lines = []
+
+     for line_image in line_images:
+         # Prepare the image for the model
+         pixel_values = processor(line_image, return_tensors="pt").pixel_values
+         pixel_values = pixel_values.to(device)
+
+         # Recognize the text
+         with torch.no_grad():
+             generated_ids = model.generate(
+                 pixel_values,
+                 max_length=256,
+                 num_beams=4,
+                 early_stopping=True
+             )
+
+         # Decode the result
+         line_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+         recognized_lines.append(line_text)
+
+     # Join all lines into a single text
+     full_text = "\n".join(recognized_lines)
+
+     return full_text
+
+ def save_text_to_file(text):
+     """
+     Save the recognized text to a file.
+     """
+     with open("recognized_text.txt", "w", encoding="utf-8") as f:
+         f.write(text)
+     return "recognized_text.txt"
+
+ def process_image(input_image):
+     """
+     Main function for processing an image.
+     """
+     # Convert to a PIL Image if necessary
+     if not isinstance(input_image, Image.Image):
+         input_image = Image.fromarray(input_image)
+
+     # Recognize the text
+     recognized_text = recognize_text(input_image)
+
+     # Save the result to a file
+     output_file = save_text_to_file(recognized_text)
+
+     return recognized_text, output_file
+
+ # Build the Gradio interface
+ with gr.Blocks(title="РаспознаваниС рукописного тСкста") as demo:
+     gr.Markdown("# РаспознаваниС рукописного тСкста")
+     gr.Markdown("Π—Π°Π³Ρ€ΡƒΠ·ΠΈΡ‚Π΅ ΠΈΠ·ΠΎΠ±Ρ€Π°ΠΆΠ΅Π½ΠΈΠ΅ с рукописным тСкстом для распознавания.")
+
+     with gr.Row():
+         input_image = gr.Image(type="pil", label="Π˜Π·ΠΎΠ±Ρ€Π°ΠΆΠ΅Π½ΠΈΠ΅")
+
+     with gr.Row():
+         submit_btn = gr.Button("Π Π°ΡΠΏΠΎΠ·Π½Π°Ρ‚ΡŒ тСкст")
+
+     with gr.Row():
+         text_output = gr.Textbox(label="Распознанный тСкст", lines=10)
+         file_output = gr.File(label="Π‘ΠΊΠ°Ρ‡Π°Ρ‚ΡŒ тСкстовый Ρ„Π°ΠΉΠ»")
+
+     submit_btn.click(
+         fn=process_image,
+         inputs=input_image,
+         outputs=[text_output, file_output]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,111 @@
+ aiofiles==23.2.1
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.11.14
+ aiosignal==1.3.2
+ annotated-types==0.7.0
+ anyio==4.9.0
+ async-timeout==5.0.1
+ attrs==25.3.0
+ cattrs==24.1.2
+ certifi==2025.1.31
+ charset-normalizer==3.4.1
+ click==8.1.8
+ coremltools==8.2
+ exceptiongroup==1.2.2
+ fastapi==0.115.11
+ ffmpy==0.5.0
+ filelock==3.18.0
+ frozenlist==1.5.0
+ fsspec==2025.3.0
+ gradio==5.22.0
+ gradio_client==1.8.0
+ groovy==0.1.2
+ h11==0.14.0
+ httpcore==1.0.7
+ httpx==0.28.1
+ huggingface-hub==0.29.3
+ idna==3.10
+ imageio==2.37.0
+ importlib_resources==6.5.2
+ Jinja2==3.1.6
+ joblib==1.4.2
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ kraken==5.3.0
+ lazy_loader==0.4
+ lightning==2.4.0
+ lightning-utilities==0.14.2
+ lxml==5.3.1
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ mpmath==1.3.0
+ multidict==6.2.0
+ networkx==3.4.2
+ numpy==2.0.2
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.20.5
+ nvidia-nvjitlink-cu12==12.8.93
+ nvidia-nvtx-cu12==12.1.105
+ opencv-python==4.11.0.86
+ orjson==3.10.15
+ packaging==24.2
+ pandas==2.2.3
+ pillow==11.1.0
+ propcache==0.3.0
+ protobuf==6.30.1
+ pyaml==25.1.0
+ pyarrow==19.0.1
+ pydantic==2.10.6
+ pydantic_core==2.27.2
+ pydub==0.25.1
+ Pygments==2.19.1
+ python-bidi==0.6.6
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytorch-lightning==2.5.1
+ pytz==2025.1
+ PyYAML==6.0.2
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.3
+ rich==13.9.4
+ rpds-py==0.23.1
+ ruff==0.11.2
+ safehttpx==0.1.6
+ safetensors==0.5.3
+ scikit-image==0.24.0
+ scikit-learn==1.5.2
+ scipy==1.13.1
+ semantic-version==2.10.0
+ shapely==2.0.7
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ starlette==0.46.1
+ sympy==1.13.3
+ threadpoolctl==3.5.0
+ tifffile==2025.3.13
+ tokenizers==0.20.3
+ tomlkit==0.13.2
+ torch==2.4.1
+ torchmetrics==1.7.0
+ torchvision==0.19.1
+ tqdm==4.67.1
+ transformers==4.45.0
+ triton==3.0.0
+ typer==0.15.2
+ typing_extensions==4.12.2
+ tzdata==2025.2
+ urllib3==2.3.0
+ uvicorn==0.34.0
+ websockets==15.0.1
+ yarl==1.18.3