import gradio as gr
import trafilatura
from transformers import pipeline
import pytesseract
from PIL import Image
import requests
from io import BytesIO
import difflib
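# Dependency notes (assumptions, not taken from the original repo): the imports above map to
# gradio, trafilatura, transformers (plus torch), pytesseract, pillow, and requests in
# requirements.txt. pytesseract is only a wrapper, so the Tesseract OCR engine itself must also
# be installed (e.g. the `tesseract-ocr` system package, listed in packages.txt on a Space).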
# Load model
reviewer = pipeline("text2text-generation", model="google/flan-t5-base")
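# flan-t5-base is small enough to run on CPU; the pipeline downloads the weights from the
# Hugging Face Hub on first use, so the first request after a cold start can be slow.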
# OCR from image URL
def extract_text_from_image_url(img_url):
    try:
        # timeout so a bad image URL cannot hang the request indefinitely
        response = requests.get(img_url, timeout=15)
        img = Image.open(BytesIO(response.content))
        text = pytesseract.image_to_string(img)
        return text
    except Exception as e:
        return f"❌ OCR Error: {e}"
# Extract main blog content from URL
def extract_text_from_url(url):
    downloaded = trafilatura.fetch_url(url)
    if downloaded:
        extracted = trafilatura.extract(downloaded)
        # trafilatura.extract can return None when no main content is found
        if extracted:
            return extracted
    return "❌ Blog Error: Could not fetch content from the URL."
# Highlight diffs using difflib
def highlight_diffs(orig, suggestion):
    diff = difflib.ndiff(orig.split(), suggestion.split())
    result = []
    for word in diff:
        if word.startswith('- '):
            result.append(f"~~{word[2:]}~~")
        elif word.startswith('+ '):
            result.append(f"**{word[2:]}**")
        elif word.startswith('  '):
            result.append(word[2:])
    return " ".join(result)
# Review line-by-line
def review_lines(text):
    lines = text.strip().split('\n')
    reviewed = []
    for line in lines:
        if not line.strip():
            continue
        prompt = f"Fix grammar, tone, and clarity:\n\n{line}"
        response = reviewer(prompt, max_new_tokens=100)[0]['generated_text'].strip()
        highlighted = highlight_diffs(line.strip(), response)
        reviewed.append((line.strip(), highlighted, response))
    return reviewed
# Finalize accepted suggestions
def finalize_text(originals, suggestions, decisions):
    output = []
    for orig, sugg, accepted in zip(originals, suggestions, decisions):
        output.append(sugg if accepted else orig)
    return "\n".join(output)
# Build Gradio UI
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## ✨ BlogChecker AI\nSmart blog reviewer with OCR + AI suggestions")
    with gr.Row():
        blog_url = gr.Textbox(label="🔗 Blog URL")
        image_url = gr.Textbox(label="🖼️ Image URL (optional)")
    extract_btn = gr.Button("🔍 Extract")
    combined_text = gr.Textbox(label="📝 Combined Blog + OCR Text", lines=10)
    with gr.Row():
        review_btn = gr.Button("🧠 Review Content")
        finalize_btn = gr.Button("✅ Finalize Clean Blog")
    # Review results live in session state so the finalize step can read the user's choices;
    # the suggestions are shown in a hidden column that becomes visible once a review has run.
    review_state = gr.State([])  # list of (original_line, clean_suggestion) pairs
    with gr.Column(visible=False) as review_area:
        review_display = gr.Markdown()
        accept_boxes = gr.CheckboxGroup(label="✅ Accept suggestions for these lines", choices=[])
    final_output = gr.Textbox(label="📦 Final Clean Blog", lines=10)

    # Extract combined content
    def extract_both(url, img_url):
        blog = extract_text_from_url(url)
        ocr = extract_text_from_image_url(img_url) if img_url else ""
        return blog + ("\n" + ocr if ocr else "")

    # Review the text and prepare the per-line suggestion display
    def do_review(text):
        results = review_lines(text)
        parts = []
        for idx, (orig, highlighted, _clean) in enumerate(results):
            parts.append(f"**Original line {idx + 1}:** {orig}\n\n**Suggested edit {idx + 1}:** {highlighted}")
        choices = [f"Line {idx + 1}" for idx in range(len(results))]
        state = [(orig, clean) for orig, _highlighted, clean in results]
        return (
            state,
            gr.update(visible=True),
            gr.update(value="\n\n".join(parts)),
            gr.update(choices=choices, value=[]),
        )

    # Compile final clean version from the accepted suggestions
    def collect_results(state, accepted):
        originals = [orig for orig, _ in state]
        suggestions = [sugg for _, sugg in state]
        decisions = [f"Line {idx + 1}" in accepted for idx in range(len(state))]
        return finalize_text(originals, suggestions, decisions)

    # Wire buttons
    extract_btn.click(fn=extract_both, inputs=[blog_url, image_url], outputs=combined_text)
    review_btn.click(fn=do_review, inputs=combined_text,
                     outputs=[review_state, review_area, review_display, accept_boxes])
    finalize_btn.click(fn=collect_results, inputs=[review_state, accept_boxes], outputs=final_output)
app.launch()
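# Running outside Spaces (assumption, not part of the original file): `python app.py` starts the
# Gradio server and prints a local URL (http://127.0.0.1:7860 by default) to open in a browser.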