AMR-KELEG committed on
Commit
c0a63e1
·
1 Parent(s): 3563942

Show preprocessing options

Browse files
Files changed (1) hide show
  1. app.py +25 -12
app.py CHANGED
@@ -12,24 +12,29 @@ import base64
12
  import re
13
 
14
 
15
- def preprocess_text(arabic_text):
16
  """Apply preprocessing to the given Arabic text.
17
 
18
  Args:
19
  arabic_text: The Arabic text to be preprocessed.
 
 
20
 
21
  Returns:
22
  The preprocessed Arabic text.
23
  """
24
- no_urls = re.sub(
25
- r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b",
26
- "",
27
- arabic_text,
28
- flags=re.MULTILINE,
29
- )
30
- no_english = re.sub(r"[a-zA-Z]", "", no_urls)
 
 
 
31
 
32
- return no_english
33
 
34
 
35
  @st.cache_data
@@ -57,7 +62,7 @@ tokenizer = AutoTokenizer.from_pretrained(constants.MODEL_NAME)
57
  model = load_model(constants.MODEL_NAME)
58
 
59
 
60
- def compute_ALDi(sentences):
61
  """Computes the ALDi score for the given sentences.
62
 
63
  Args:
@@ -72,7 +77,9 @@ def compute_ALDi(sentences):
72
  BATCH_SIZE = 4
73
  output_logits = []
74
 
75
- preprocessed_sentences = [preprocess_text(s) for s in sentences]
 
 
76
 
77
  for first_index in range(0, len(preprocessed_sentences), BATCH_SIZE):
78
  inputs = tokenizer(
@@ -101,6 +108,7 @@ def render_metadata():
101
  c = st.container()
102
  c.write(html, unsafe_allow_html=True)
103
 
 
104
  render_svg(open("assets/ALDi_logo.svg").read())
105
  render_metadata()
106
 
@@ -114,8 +122,13 @@ with tab1:
114
  # TODO: Check if this is needed!
115
  clicked = st.button("Submit")
116
 
 
 
 
117
  if sent:
118
- ALDi_score = compute_ALDi([sent])[0]
 
 
119
 
120
  ORANGE_COLOR = "#FF8000"
121
  fig, ax = plt.subplots(figsize=(8, 1))
 
12
  import re
13
 
14
 
15
# Optional http/https scheme, "://", then a run of URL characters. The
# character set includes "-", "#", "~", ":", "+" and "@" so that hyphenated
# paths, fragments, ports and user-info parts are removed together with the
# rest of the URL instead of being left behind as residue.
# The trailing \b forces the match to end on a word character, so punctuation
# that closes a sentence (e.g. a final ".") stays in the text.
# NOTE: \w is Unicode-aware for str patterns, so Arabic letters glued to the
# end of a URL without whitespace are consumed as part of the URL.
_URL_PATTERN = re.compile(r"(?:https?)?://[\w./?=&%\-#~:+@]*\b")

# ASCII Latin letters only; digits, punctuation and accented letters are kept.
_LATIN_LETTER_PATTERN = re.compile(r"[a-zA-Z]")


def preprocess_text(arabic_text, remove_urls=True, remove_latin=True):
    """Apply preprocessing to the given Arabic text.

    URL removal runs before Latin-letter removal so that URLs are still
    recognisable when they are stripped.

    Args:
        arabic_text: The Arabic text to be preprocessed.
        remove_urls: Boolean indicating whether to remove URLs.
        remove_latin: Boolean indicating whether to remove Latin characters
            (ASCII letters a-z and A-Z).

    Returns:
        The preprocessed Arabic text. Surrounding whitespace is not
        collapsed, so removing a token may leave consecutive spaces.
    """
    if remove_urls:
        arabic_text = _URL_PATTERN.sub("", arabic_text)

    if remove_latin:
        arabic_text = _LATIN_LETTER_PATTERN.sub("", arabic_text)

    return arabic_text
38
 
39
 
40
  @st.cache_data
 
62
  model = load_model(constants.MODEL_NAME)
63
 
64
 
65
+ def compute_ALDi(sentences, remove_urls=True, remove_latin=True):
66
  """Computes the ALDi score for the given sentences.
67
 
68
  Args:
 
77
  BATCH_SIZE = 4
78
  output_logits = []
79
 
80
+ preprocessed_sentences = [
81
+ preprocess_text(s, remove_urls, remove_latin) for s in sentences
82
+ ]
83
 
84
  for first_index in range(0, len(preprocessed_sentences), BATCH_SIZE):
85
  inputs = tokenizer(
 
108
  c = st.container()
109
  c.write(html, unsafe_allow_html=True)
110
 
111
+
112
  render_svg(open("assets/ALDi_logo.svg").read())
113
  render_metadata()
114
 
 
122
  # TODO: Check if this is needed!
123
  clicked = st.button("Submit")
124
 
125
+ remove_urls = st.toggle("Remove urls", value=True)
126
+ remove_latin = st.toggle("Remove Latin characters", value=True)
127
+
128
  if sent:
129
+ ALDi_score = compute_ALDi(
130
+ [sent], remove_urls=remove_urls, remove_latin=remove_latin
131
+ )[0]
132
 
133
  ORANGE_COLOR = "#FF8000"
134
  fig, ax = plt.subplots(figsize=(8, 1))