T Le committed on
Commit
cd8b282
·
1 Parent(s): d054553

Topic modeling model update

Browse files
pages/2 Topic Modeling.py CHANGED
@@ -196,7 +196,7 @@ if uploaded_file is not None:
196
  method = c1.selectbox(
197
  'Choose method',
198
  ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'))
199
- ColCho = c2.selectbox('Choose column', (coldf))
200
  num_cho = c3.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
201
 
202
  d1, d2 = st.columns([3,7])
@@ -235,8 +235,8 @@ if uploaded_file is not None:
235
  if fine_tuning:
236
  topic_labelling = st.toggle("Automatic topic labelling")
237
  if topic_labelling:
238
- llm_provider = st.selectbox("Provider",["OpenAI","HuggingFace"])
239
- if llm_provider == "OpenAI":
240
  api_key = st.text_input("API Key")
241
 
242
  else:
@@ -527,21 +527,36 @@ if uploaded_file is not None:
527
  "MMR": mmr,
528
  }
529
  if topic_labelling:
530
- if llm_provider == "OpenAI":
531
  client = openai.OpenAI(api_key=api_key)
532
  representation_model = {
533
  "KeyBERT": keybert,
534
  "MMR": mmr,
535
  "test": OpenAI(client, model = "gpt-4o-mini", delay_in_seconds=10)
536
  }
537
- elif llm_provider == "HuggingFace":
538
- gennie = pipeline("text2text-generation", model = "google/flan-t5-base")
539
- clientmod = TextGeneration(gennie)
540
  representation_model = {
541
  "KeyBERT": keybert,
542
  "MMR": mmr,
543
  "test": clientmod
544
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
 
546
  vectorizer_model = CountVectorizer(ngram_range=(1, xgram), stop_words='english')
547
  topic_model = BERTopic(representation_model = representation_model, embedding_model=model, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, vectorizer_model=vectorizer_model, top_n_words=bert_top_n_words)
 
196
  method = c1.selectbox(
197
  'Choose method',
198
  ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'))
199
+ ColCho = c2.selectbox('Choose column', (["Title","Abstract"]))
200
  num_cho = c3.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
201
 
202
  d1, d2 = st.columns([3,7])
 
235
  if fine_tuning:
236
  topic_labelling = st.toggle("Automatic topic labelling")
237
  if topic_labelling:
238
+ llm_model = st.selectbox("Model",["OpenAI/gpt-4o","Google/Flan-t5","OpenAI/gpt-oss"])
239
+ if llm_model == "OpenAI/gpt-4o":
240
  api_key = st.text_input("API Key")
241
 
242
  else:
 
527
  "MMR": mmr,
528
  }
529
  if topic_labelling:
530
+ if llm_model == "OpenAI/gpt-4o":
531
  client = openai.OpenAI(api_key=api_key)
532
  representation_model = {
533
  "KeyBERT": keybert,
534
  "MMR": mmr,
535
  "test": OpenAI(client, model = "gpt-4o-mini", delay_in_seconds=10)
536
  }
537
+ elif llm_model == "Google/Flan-t5":
538
+ gen = pipeline("text2text-generation", model = "google/flan-t5-base")
539
+ clientmod = TextGeneration(gen)
540
  representation_model = {
541
  "KeyBERT": keybert,
542
  "MMR": mmr,
543
  "test": clientmod
544
  }
545
+ elif llm_model == "OpenAI/gpt-oss":
546
+ gen = pipeline("text-generation",
547
+ model = "openai/gpt-oss-20b",
548
+ torch_dtype = "auto",
549
+ device_map = "auto",
550
+ )
551
+ clientmod = TextGeneration(gen)
552
+
553
+ representation_model = {
554
+ "KeyBERT": keybert,
555
+ "MMR": mmr,
556
+ "test": gen
557
+ }
558
+
559
+
560
 
561
  vectorizer_model = CountVectorizer(ngram_range=(1, xgram), stop_words='english')
562
  topic_model = BERTopic(representation_model = representation_model, embedding_model=model, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, vectorizer_model=vectorizer_model, top_n_words=bert_top_n_words)
tools/__pycache__/sourceformat.cpython-310.pyc DELETED
Binary file (5.74 kB)