File size: 2,517 Bytes
81446ef
 
 
bce5fb4
 
81446ef
 
bfa64b6
 
 
81446ef
bfa64b6
b59b230
cfbeebd
81446ef
7ea96d2
ae44a4c
7057a87
4570d11
 
b9c3b16
81446ef
eace371
81446ef
b9c3b16
81446ef
b9c3b16
 
 
 
 
 
81446ef
b9c3b16
 
 
 
eda711d
 
 
f26be33
b59b230
 
 
2a14cb0
 
 
b59b230
ce0fb45
2a14cb0
 
 
eda711d
b9b0b35
eda711d
 
324fe53
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import pandas as pd
import streamlit as st
from keybert import KeyBERT
import yake
from keyphrase_vectorizers import KeyphraseCountVectorizer

@st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=True)
def load_model():
  """Build and cache the KeyBERT extractor backed by PatentSBERTa.

  Wrapped in st.cache so the sentence-transformer weights are downloaded
  and initialised only once per Streamlit session instead of on every rerun.
  """
  return KeyBERT("AI-Growth-Lab/PatentSBERTa")
  
# Instantiate the cached KeyBERT model once at module import.
model = load_model()

# --- Page layout ---------------------------------------------------------
st.title("Patent Text Extractor")
placeholder = st.empty()
# Free-form input area for the patent text to analyse.
text = placeholder.text_area("Paste or write text", height=300)
# Trigger for the extraction step below.
button = st.button("Extract Keywords")
# NOTE(review): a large body of commented-out experiments (sidebar sliders for
# top_n / ngram_range, MMR & Max-Sum diversity options, a BigBird model, and a
# KeyphraseCountVectorizer-based call) was removed here as dead code; recover
# it from version control if those options are ever reinstated.
#kw_model = KeyBERT(model="google/bigbird-pegasus-large-bigpatent")
# Run extraction only when the user clicks the button and has entered text.
# Previously this ran unconditionally on every Streamlit rerun (the `button`
# value was ignored) and built a SECOND KeyBERT instance, re-loading the same
# PatentSBERTa weights that the cached `model` above already holds. The
# redundant mid-file re-imports of keybert / keyphrase_vectorizers (the
# latter never used) are dropped; both are already imported at the top.
if button and text.strip():
    # YAKE proposes up to 50 candidate keyphrases from the raw text; KeyBERT
    # then re-ranks those candidates against the document embedding.
    kw_extractor = yake.KeywordExtractor(top=50)
    candidates = [phrase for phrase, _score in kw_extractor.extract_keywords(text)]
    keywords = model.extract_keywords(
        text,
        candidates,
        keyphrase_ngram_range=(1, 3),
        top_n=50,
        stop_words='english',
    )
    if keywords:
        st.info("Extracted keywords")
        # keywords is a list of (phrase, score) pairs -> two-column table.
        st.table(pd.DataFrame(keywords, columns=["Keyword", "Score"]))