KevSun committed on
Commit
18d2947
verified
1 Parent(s): 42258c6

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from sentence_transformers import SentenceTransformer, util
3
+ from sklearn.decomposition import LatentDirichletAllocation
4
+ from sklearn.feature_extraction.text import CountVectorizer
5
+ from sklearn.manifold import TSNE
6
+ from langdetect import detect, DetectorFactory
7
+ import numpy as np
8
+ import matplotlib.pyplot as plt
9
+ import pandas as pd
10
+ import torch
11
+
12
# First Streamlit call in the script: browser tab title and wide layout.
st.set_page_config(
    page_title="Multilingual Text Analysis System",
    layout="wide",
)


@st.cache_resource
def load_model():
    """Instantiate the multilingual sentence-embedding model.

    The function is decorated with ``st.cache_resource``; it takes no
    arguments and returns a ``SentenceTransformer`` built from the
    ``distiluse-base-multilingual-cased-v1`` checkpoint.
    """
    encoder = SentenceTransformer('distiluse-base-multilingual-cased-v1')
    return encoder


# Pin langdetect's seed (presumably so detection results are repeatable).
DetectorFactory.seed = 0

# Module-level model instance handed to the agent classes below.
multi_embedding_model = load_model()
20
+
21
class WordEmbeddingAgent:
    """Produce embeddings for a batch of words through an injected encoder."""

    def __init__(self, model):
        # Any object exposing ``encode(words)`` can be used here.
        self.model = model

    def get_embeddings(self, words):
        """Return the result of ``self.model.encode(words)`` unchanged."""
        encoder = self.model
        vectors = encoder.encode(words)
        return vectors
27
+
28
class SimilarityAgent:
    """Score the cosine similarity between the embeddings of two texts."""

    def __init__(self, model):
        # Encoder used for both texts; must accept ``convert_to_tensor``.
        self.model = model

    def compute_similarity(self, text1, text2):
        """Return the cosine similarity of *text1* and *text2* as a float.

        Each text is encoded separately (``text1`` first) with
        ``convert_to_tensor=True``; the similarity result is reduced to a
        Python scalar with ``.item()``.
        """
        first_vec, second_vec = (
            self.model.encode(text, convert_to_tensor=True)
            for text in (text1, text2)
        )
        score = util.pytorch_cos_sim(first_vec, second_vec)
        return score.item()
36
+
37
class TopicModelingAgent:
    """Fit an LDA topic model on bag-of-words counts and list top terms."""

    # Preferred document-frequency pruning bounds for the vectorizer.
    _MIN_DF = 2
    _MAX_DF = 0.9

    def __init__(self, n_components=5):
        """Create the underlying LDA model with *n_components* topics."""
        self.lda_model = LatentDirichletAllocation(n_components=n_components, random_state=42)

    @classmethod
    def _df_bounds(cls, n_docs):
        """Return ``(min_df, max_df)`` that are valid for *n_docs* documents.

        CountVectorizer rejects settings where ``max_df * n_docs`` is smaller
        than ``min_df``. With the preferred bounds that happens for corpora
        of fewer than three documents, so pruning is disabled for them.
        """
        if cls._MAX_DF * n_docs < cls._MIN_DF:
            return 1, 1.0
        return cls._MIN_DF, cls._MAX_DF

    def fit_transform(self, texts, lang):
        """Vectorize *texts*, fit the LDA model and return topic weights.

        Args:
            texts: Iterable of document strings.
            lang: Language code; ``'en'`` enables the built-in English
                stop-word list, any other value disables stop-word removal.

        Returns:
            A ``(document_topic_matrix, vectorizer)`` tuple. The fitted
            vectorizer is what ``get_topics`` needs to name the terms.
        """
        texts = list(texts)
        stop_words = 'english' if lang == 'en' else None
        min_df, max_df = self._df_bounds(len(texts))
        vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, stop_words=stop_words)
        dtm = vectorizer.fit_transform(texts)
        self.lda_model.fit(dtm)
        return self.lda_model.transform(dtm), vectorizer

    def get_topics(self, vectorizer, num_words=5):
        """Map each topic index to its *num_words* highest-weighted terms.

        Terms are listed in ascending weight order (strongest term last),
        as produced by slicing the tail of ``argsort``.
        """
        # Look the vocabulary up once instead of once per topic.
        feature_names = vectorizer.get_feature_names_out()
        return {
            idx: [feature_names[i] for i in weights.argsort()[-num_words:]]
            for idx, weights in enumerate(self.lda_model.components_)
        }
53
+
54
def detect_language(text):
    """Return the language code detected for *text*, or ``"unknown"``.

    Detection is best-effort: any ordinary error raised by ``detect`` is
    turned into the ``"unknown"`` sentinel instead of propagating.
    """
    try:
        return detect(text)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return "unknown"
59
+
60
@st.cache_data
def tsne_visualization(embeddings, words):
    """Project *embeddings* to 2-D with t-SNE and label each point.

    Args:
        embeddings: Sequence of embedding vectors, one per entry in *words*.
        words: Labels stored in the ``word`` column of the result.

    Returns:
        A DataFrame with columns ``x``, ``y`` and ``word``.
    """
    n_samples = len(embeddings)
    if n_samples < 2:
        # t-SNE cannot run on fewer than two points; put what exists at the origin.
        embeddings_2d = [[0.0, 0.0]] * n_samples
    else:
        # scikit-learn requires perplexity < n_samples; 30 is its default,
        # so short inputs need a smaller value.
        perplexity = min(30, n_samples - 1)
        tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
        embeddings_2d = tsne.fit_transform(embeddings)
    df = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
    df['word'] = words
    return df
67
+
68
st.title("Multilingual Text Analysis System")
user_input = st.text_area("Enter your text here:")

# Whitespace-only input carries no words, so treat it as empty.
text = user_input.strip()

# ``st.button`` is evaluated first so the button is always rendered; the
# analysis also runs on any rerun where text is already present.
if st.button("Analyze") or text:
    if text:
        lang = detect_language(text)
        st.write(f"Detected language: {lang}")

        embedding_agent = WordEmbeddingAgent(multi_embedding_model)
        similarity_agent = SimilarityAgent(multi_embedding_model)
        topic_modeling_agent = TopicModelingAgent()

        words = text.split()

        with st.spinner("Generating word embeddings..."):
            embeddings = embedding_agent.get_embeddings(words)
        st.success("Word Embeddings Generated.")

        with st.spinner("Creating t-SNE visualization..."):
            try:
                tsne_df = tsne_visualization(embeddings, words)
            except ValueError as err:
                # Very short inputs can be rejected by t-SNE; skip the plot
                # rather than aborting the remaining analyses.
                st.warning(f"t-SNE visualization skipped: {err}")
            else:
                fig, ax = plt.subplots()
                ax.scatter(tsne_df['x'], tsne_df['y'])
                for word, x, y in zip(tsne_df['word'], tsne_df['x'], tsne_df['y']):
                    ax.annotate(word, (x, y))
                st.pyplot(fig)
                # Release the figure so reruns do not accumulate open figures.
                plt.close(fig)

        with st.spinner("Extracting topics..."):
            # A fixed second document is added so the model sees more than one text.
            texts = [text, "Another text to improve topic modeling."]
            try:
                topic_distr, vectorizer = topic_modeling_agent.fit_transform(texts, lang)
                topics = topic_modeling_agent.get_topics(vectorizer)
            except ValueError as err:
                # Raised by the vectorizer when no usable vocabulary remains.
                st.warning(f"Topic extraction skipped: {err}")
            else:
                st.subheader("Topics Extracted:")
                for topic, top_words in topics.items():
                    st.write(f"Topic {topic}: {', '.join(top_words)}")

        with st.spinner("Computing similarity..."):
            # Compare against a Spanish sample unless the input was detected as English.
            if lang != 'en':
                text2 = "Otro texto de ejemplo para comparación de similitud."
            else:
                text2 = "Another example text for similarity comparison."
            similarity_score = similarity_agent.compute_similarity(text, text2)
            st.write(f"Similarity Score with example text: {similarity_score:.4f}")

    else:
        st.warning("Please enter some text to analyze.")

st.sidebar.title("About")
st.sidebar.info("This app performs multilingual text analysis using various NLP techniques.")
112
+