import gradio as gr
import trafilatura
from transformers import pipeline
import pytesseract
from PIL import Image
import requests
from io import BytesIO
import difflib
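# Note: besides the Python packages imported above, the Tesseract OCR engine must be
# installed on the system for pytesseract.image_to_string to work.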

# Load the rewriting model once at startup (downloads google/flan-t5-base on first run)
reviewer = pipeline("text2text-generation", model="google/flan-t5-base")

# OCR: download an image and extract any text it contains
def extract_text_from_image_url(img_url):
    try:
        response = requests.get(img_url, timeout=15)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        return pytesseract.image_to_string(img)
    except Exception as e:
        return f"❌ OCR Error: {e}"

# Extract the main article text from a blog URL
def extract_text_from_url(url):
    downloaded = trafilatura.fetch_url(url)
    if downloaded:
        extracted = trafilatura.extract(downloaded)
        if extracted:
            return extracted
    return "❌ Blog Error: Could not fetch content from the URL."

# Highlight word-level differences with difflib: removed words are struck through, added words are bolded
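# e.g. highlight_diffs("teh cat", "the cat") -> "~~teh~~ **the** cat"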
def highlight_diffs(original, suggestion):
    diff = difflib.ndiff(original.split(), suggestion.split())
    result = []
    for token in diff:
        if token.startswith("- "):
            result.append(f"~~{token[2:]}~~")
        elif token.startswith("+ "):
            result.append(f"**{token[2:]}**")
        elif token.startswith("  "):
            result.append(token[2:])
    return " ".join(result)

# Review lines with diffs
def review_lines(text):
    lines = text.strip().split('\n')
    suggestions = []
    for line in lines:
        if line.strip() == "":
            continue
        prompt = f"Rewrite this to fix grammar, tone, and remove any offensive language:\n\n{line}"
        suggestion = reviewer(prompt, max_new_tokens=100)[0]['generated_text'].strip()
        highlighted = highlight_diffs(line.strip(), suggestion)
        suggestions.append((line, highlighted, suggestion))
    return suggestions

# Finalize accepted suggestions
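# e.g. finalize_text(["a", "b"], ["A", "B"], [True, False]) -> "A\nb"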
def finalize_text(originals, suggestions, decisions):
    final = []
    for orig, sugg, keep in zip(originals, suggestions, decisions):
        final.append(sugg if keep else orig)
    return "\n".join(final)

# Gradio app
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as app:
    gr.Markdown("## ✨ BlogChecker AI\nSmart AI reviewer for blog content, with interactive approval and OCR image support.")

    with gr.Row():
        blog_url = gr.Textbox(label="πŸ“Ž Blog URL")
        image_url = gr.Textbox(label="πŸ–ΌοΈ Image URL (optional)")
        extract_btn = gr.Button("πŸ” Extract")

    combined_text = gr.Textbox(label="πŸ“ Combined Blog + OCR Text", lines=10)

    with gr.Row():
        review_btn = gr.Button("🧠 Review Content")
        finalize_btn = gr.Button("βœ… Finalize Clean Blog")

    # Review results: a Markdown report of the suggested edits, a checkbox group for
    # accepting individual lines, and a gr.State holding (original, suggestion) pairs
    review_display = gr.Markdown(visible=False)
    accept_group = gr.CheckboxGroup(choices=[], label="✅ Accept suggestions for these lines", visible=False)
    review_state = gr.State([])

    final_output = gr.Textbox(label="πŸ“¦ Final Clean Blog", lines=10)

    # Text extraction logic
    def extract_both(url, img_url):
        blog = extract_text_from_url(url)
        ocr = extract_text_from_image_url(img_url) if img_url else ""
        return blog + ("\n" + ocr if ocr else "")

    # Review processing with diffs. A plain Gradio callback cannot create new components
    # at runtime, so the review is rendered as a single Markdown report plus a checkbox
    # group, and the (original, suggestion) pairs are kept in gr.State for finalization.
    def process_review(text):
        results = review_lines(text)
        report, choices, pairs = [], [], []
        for i, (orig, highlighted, clean_sugg) in enumerate(results, start=1):
            report.append(f"**Original line {i}:** {orig}\n\n**Suggested edit {i}:** {highlighted}")
            choices.append(f"Line {i}")
            pairs.append((orig, clean_sugg))
        return (
            gr.update(value="\n\n---\n\n".join(report), visible=True),
            gr.update(choices=choices, value=[], visible=True),
            pairs,
        )

    # Finalization logic: keep the suggestion for each accepted line, the original otherwise
    def collect_dynamic_decisions(pairs, accepted):
        accepted_indices = {int(label.split()[-1]) - 1 for label in accepted}
        originals = [orig for orig, _ in pairs]
        suggestions = [sugg for _, sugg in pairs]
        decisions = [i in accepted_indices for i in range(len(pairs))]
        return finalize_text(originals, suggestions, decisions)

    # Wire actions
    extract_btn.click(fn=extract_both, inputs=[blog_url, image_url], outputs=combined_text)
    review_btn.click(fn=process_review, inputs=combined_text, outputs=[review_display, accept_group, review_state])
    finalize_btn.click(fn=collect_dynamic_decisions, inputs=[review_state, accept_group], outputs=final_output)

app.launch()
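# Pass share=True to app.launch() to also get a temporary public Gradio link.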