File size: 2,229 Bytes
8d2615a
5866ba2
 
c8359bd
 
5866ba2
8d2615a
 
5866ba2
c8359bd
 
5866ba2
 
 
8d2615a
5866ba2
90d9abf
 
c8359bd
 
 
5866ba2
8d2615a
5866ba2
90d9abf
c8359bd
 
 
 
 
 
 
90d9abf
5866ba2
8d2615a
c8359bd
 
5866ba2
 
 
 
c8359bd
5866ba2
 
 
 
c8359bd
 
8d2615a
67f1091
5866ba2
 
 
 
67f1091
5866ba2
 
 
 
67f1091
5866ba2
c8359bd
5866ba2
 
 
90d9abf
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# import part
import hashlib
import io
import os
import tempfile

import streamlit as st
from gtts import gTTS
from transformers import pipeline

# function part
# img2text
@st.cache_resource
def _load_image_to_text_model():
    """Build the BLIP image-captioning pipeline.

    Wrapped in ``st.cache_resource`` so the model is constructed once and
    reused, instead of being rebuilt every time Streamlit re-executes the
    script.
    """
    return pipeline("image-to-text",
                    model="Salesforce/blip-image-captioning-base")


def img2text(url):
    """Return a generated caption for an image.

    Args:
        url: Image reference handed straight to the image-to-text pipeline
            (the script in this file passes a local file path).

    Returns:
        The ``"generated_text"`` field of the first pipeline result.
    """
    image_to_text_model = _load_image_to_text_model()
    text = image_to_text_model(url)[0]["generated_text"]
    return text

# text2story
@st.cache_resource
def _load_story_model():
    """Build the Hugging Face text-generation pipeline used for stories.

    Wrapped in ``st.cache_resource`` so the model is constructed once and
    reused, instead of being rebuilt every time Streamlit re-executes the
    script.
    """
    return pipeline("text-generation",
                    model="agentica-org/DeepScaleR-1.5B-Preview")


def text2story(text):
    """Generate a story from a text prompt.

    Args:
        text: Prompt passed to the text-generation pipeline.

    Returns:
        The ``'generated_text'`` field of the single returned sequence.
    """
    story_pipeline = _load_story_model()
    # One sequence is requested; max_length=200 is forwarded unchanged to
    # the pipeline call.
    result = story_pipeline(text, max_length=200, num_return_sequences=1)
    story_text = result[0]['generated_text']
    return story_text

# text2audio
def text2audio(story_text):
    """Convert text to English speech with gTTS, entirely in memory.

    Args:
        story_text: The text to be spoken.

    Returns:
        A dict with two entries: ``'audio'``, an ``io.BytesIO`` holding the
        synthesized audio and rewound to its start, and ``'sampling_rate'``,
        the constant 16000.
    """
    # In-memory buffer that receives the audio bytes (no file on disk).
    buffer = io.BytesIO()
    gTTS(text=story_text, lang='en').write_to_fp(buffer)
    # Rewind so that consumers read from the first byte.
    buffer.seek(0)
    # NOTE(review): the sampling rate is a hard-coded value, not something
    # read back from the generated audio - confirm before relying on it.
    return {'audio': buffer, 'sampling_rate': 16000}

# main part
st.set_page_config(page_title="Your Image to Audio Story",
                   page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    bytes_data = uploaded_file.getvalue()

    st.image(uploaded_file, caption="Uploaded Image",
             use_column_width=True)

    # Streamlit re-executes this script on every widget interaction,
    # including the "Play Audio" click. Stage outputs are therefore kept in
    # session state, keyed by a digest of the uploaded bytes, and are only
    # recomputed when a different image is uploaded.
    upload_digest = hashlib.sha256(bytes_data).hexdigest()
    results = st.session_state.get("story_results")
    if results is None or results.get("digest") != upload_digest:
        results = {"digest": upload_digest}
        st.session_state["story_results"] = results

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    if "scenario" not in results:
        # Write the upload under a fixed name inside a private temporary
        # directory rather than under the client-supplied filename in the
        # working directory; the directory is removed when the block exits.
        with tempfile.TemporaryDirectory() as tmp_dir:
            image_path = os.path.join(tmp_dir, "upload")
            with open(image_path, "wb") as file:
                file.write(bytes_data)
            results["scenario"] = img2text(image_path)
    st.write(results["scenario"])

    # Stage 2: Text to Story
    st.text('Generating a story...')
    if "story" not in results:
        results["story"] = text2story(results["scenario"])
    st.write(results["story"])

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')
    if "audio" not in results:
        # Keep the raw bytes so playback does not depend on the position of
        # a shared stream across reruns.
        results["audio"] = text2audio(results["story"])['audio'].getvalue()

    # Play button
    if st.button("Play Audio"):
        # gTTS output is MP3 data (per the gTTS documentation), so it is
        # declared as audio/mpeg. No sample_rate is passed: Streamlit only
        # uses that argument for NumPy array input.
        st.audio(results["audio"],
                 format="audio/mpeg",
                 start_time=0)