eacortes committed on
Commit
305e7b1
·
1 Parent(s): cbc0739

push new db or demo version

Browse files
dump.rdb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edb6dac953afd60f97f39ca1a77dfb3cc323e573cbfd6b6ed768d6d637613898
3
- size 2454945
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d61b5bfb2340158a7febe75d096dbab4a1ab5a041d1f094cd18b34063fab3d2b
3
+ size 89804397
src/app.py CHANGED
@@ -44,7 +44,7 @@ class App:
44
  def _draw_molecule_grid(cls, similar: list[SimilarMolecule]) -> np.ndarray:
45
  mols = [Chem.MolFromSmiles(m["smiles"]) for m in similar]
46
  legends = [
47
- f"{cls._truncated_attribute(m, 'name')}\n{m['category']}\n"
48
  f"{cls._truncated_attribute(m, 'smiles')}\n{m['score']:.2E}"
49
  for m in similar
50
  ]
@@ -57,7 +57,7 @@ class App:
57
  img = Draw.MolsToGridImage(
58
  mols,
59
  legends=legends,
60
- molsPerRow=2,
61
  subImgSize=(250, 250),
62
  drawOptions=draw_options,
63
  )
@@ -67,7 +67,9 @@ class App:
67
  def _display_sample_molecules(mols: pd.DataFrame):
68
  for _, row in mols.iterrows():
69
  with gr.Group():
70
- gr.Textbox(value=row["smiles"], label=f"{row['name']} ({row['category']})", interactive=False, scale=3)
 
 
71
  sample_btn = gr.Button(
72
  f"Load {row['name']}",
73
  scale=1,
@@ -81,7 +83,7 @@ class App:
81
 
82
  @staticmethod
83
  def clear_all():
84
- return "", [], [], None, "Cleared - Draw a new molecule or enter SMILES"
85
 
86
  def handle_search(self, smiles: str, embed_dim: int):
87
  if not smiles.strip():
@@ -119,7 +121,7 @@ class App:
119
  """)
120
  gr.HTML(
121
  """
122
- The Redis database indexes a curated subset of molecules from <a href="https://isomerdesign.com/pihkal/home">Isomer Design</a>
123
  <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/">
124
  <img src="https://mirrors.creativecommons.org/presskit/buttons/80x15/svg/by-nc-sa.svg" alt="License: CC BY-NC-SA 4.0"
125
  style="display:inline; height:15px; vertical-align:middle; margin-left:4px;"/>
@@ -141,6 +143,13 @@ class App:
141
  elem_id="smiles_input",
142
  )
143
 
 
 
 
 
 
 
 
144
  embedding_dimension = gr.Dropdown(
145
  choices=SUPPORTED_EMBEDDING_DIMENSIONS,
146
  value=EMBEDDING_DIMENSION,
@@ -193,6 +202,14 @@ class App:
193
  with gr.Column(scale=1):
194
  self._display_sample_molecules(SAMPLE_SMILES[2::3])
195
 
 
 
 
 
 
 
 
 
196
  search_btn.click(
197
  fn=self.handle_search,
198
  inputs=[smiles_input, embedding_dimension],
@@ -211,6 +228,7 @@ class App:
211
  js="window.clearJSME",
212
  outputs=[
213
  smiles_input,
 
214
  embedding_output,
215
  similar_molecules_output,
216
  molecule_image,
 
44
  def _draw_molecule_grid(cls, similar: list[SimilarMolecule]) -> np.ndarray:
45
  mols = [Chem.MolFromSmiles(m["smiles"]) for m in similar]
46
  legends = [
47
+ f"{cls._truncated_attribute(m, 'name')}\n{m['properties']}\n"
48
  f"{cls._truncated_attribute(m, 'smiles')}\n{m['score']:.2E}"
49
  for m in similar
50
  ]
 
57
  img = Draw.MolsToGridImage(
58
  mols,
59
  legends=legends,
60
+ molsPerRow=3,
61
  subImgSize=(250, 250),
62
  drawOptions=draw_options,
63
  )
 
67
  def _display_sample_molecules(mols: pd.DataFrame):
68
  for _, row in mols.iterrows():
69
  with gr.Group():
70
+ gr.Textbox(
71
+ value=row["smiles"], label=f"{row['name']} ({row['properties']})", interactive=False, scale=3
72
+ )
73
  sample_btn = gr.Button(
74
  f"Load {row['name']}",
75
  scale=1,
 
83
 
84
  @staticmethod
85
  def clear_all():
86
+ return "", "", [], [], None, "Cleared - Draw a new molecule or enter SMILES"
87
 
88
  def handle_search(self, smiles: str, embed_dim: int):
89
  if not smiles.strip():
 
121
  """)
122
  gr.HTML(
123
  """
124
+ The Redis database indexes <a href="https://isomerdesign.com/pihkal/home">Isomer Design</a> molecular library.
125
  <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/">
126
  <img src="https://mirrors.creativecommons.org/presskit/buttons/80x15/svg/by-nc-sa.svg" alt="License: CC BY-NC-SA 4.0"
127
  style="display:inline; height:15px; vertical-align:middle; margin-left:4px;"/>
 
143
  elem_id="smiles_input",
144
  )
145
 
146
+ canonical_smiles_output = gr.Textbox(
147
+ label="Canonical SMILES",
148
+ placeholder="Canonical representation will appear here",
149
+ interactive=False,
150
+ elem_id="canonical_smiles_output",
151
+ )
152
+
153
  embedding_dimension = gr.Dropdown(
154
  choices=SUPPORTED_EMBEDDING_DIMENSIONS,
155
  value=EMBEDDING_DIMENSION,
 
202
  with gr.Column(scale=1):
203
  self._display_sample_molecules(SAMPLE_SMILES[2::3])
204
 
205
+ # Update canonical SMILES when input changes
206
+ smiles_input.change(
207
+ fn=self.embedding_service.get_canonical_smiles,
208
+ inputs=[smiles_input],
209
+ outputs=[canonical_smiles_output],
210
+ api_name="get_canonical_smiles",
211
+ )
212
+
213
  search_btn.click(
214
  fn=self.handle_search,
215
  inputs=[smiles_input, embedding_dimension],
 
228
  js="window.clearJSME",
229
  outputs=[
230
  smiles_input,
231
+ canonical_smiles_output,
232
  embedding_output,
233
  similar_molecules_output,
234
  molecule_image,
src/constants.py CHANGED
@@ -1,25 +1,25 @@
1
  # Model config
2
  MODEL_NAME = "Derify/ChemMRL-alpha"
3
- SUPPORTED_EMBEDDING_DIMENSIONS = [1024, 768, 512, 256, 128, 64, 32, 16, 8, 4, 2]
4
  EMBEDDING_DIMENSION = max(SUPPORTED_EMBEDDING_DIMENSIONS)
5
  USE_HALF_PRECISION = True
6
 
7
  # HNSW index parameters
8
- HNSW_K = 6
9
  HNSW_PARAMETERS = {
10
  # Embedding vector dtype
11
  "TYPE": "FLOAT16" if USE_HALF_PRECISION else "FLOAT32",
12
  # Embedding vectors are normalized so COSINE and IP are equivalent
13
  "DISTANCE_METRIC": "IP",
14
  # Defines the initial capacity of the vector index. It helps in pre-allocating space for the index.
15
- "INITIAL_CAP": 440,
16
  # Max number of outgoing edges (connections) for each node in a graph layer.
17
- "M": 256,
18
  # Max number of connected neighbors to consider during graph building.
19
  # Higher values increase accuracy, but also increase index build time.
20
- "EF_CONSTRUCTION": 4096,
21
  # Max top candidates during KNN search. Higher values increase accuracy, but also increase search latency.
22
- "EF_RUNTIME": 6,
23
  }
24
 
25
  # Gradio launch parameters
 
1
  # Model config
2
  MODEL_NAME = "Derify/ChemMRL-alpha"
3
+ SUPPORTED_EMBEDDING_DIMENSIONS = [1024, 768, 512, 256, 128, 64, 32, 16]
4
  EMBEDDING_DIMENSION = max(SUPPORTED_EMBEDDING_DIMENSIONS)
5
  USE_HALF_PRECISION = True
6
 
7
  # HNSW index parameters
8
+ HNSW_K = 9
9
  HNSW_PARAMETERS = {
10
  # Embedding vector dtype
11
  "TYPE": "FLOAT16" if USE_HALF_PRECISION else "FLOAT32",
12
  # Embedding vectors are normalized so COSINE and IP are equivalent
13
  "DISTANCE_METRIC": "IP",
14
  # Defines the initial capacity of the vector index. It helps in pre-allocating space for the index.
15
+ "INITIAL_CAP": 15400,
16
  # Max number of outgoing edges (connections) for each node in a graph layer.
17
+ "M": 32,
18
  # Max number of connected neighbors to consider during graph building.
19
  # Higher values increase accuracy, but also increase index build time.
20
+ "EF_CONSTRUCTION": 512,
21
  # Max top candidates during KNN search. Higher values increase accuracy, but also increase search latency.
22
+ "EF_RUNTIME": HNSW_K,
23
  }
24
 
25
  # Gradio launch parameters
src/data.py CHANGED
@@ -4,10 +4,8 @@ import pandas as pd
4
 
5
  __data_dir = os.path.join(os.path.dirname(__file__), "data")
6
 
7
- __dataset_smiles_file = os.path.join(__data_dir, "dataset_smiles.csv")
8
  __sample_smiles_file = os.path.join(__data_dir, "sample_smiles.csv")
9
- __isomer_design_subset_file = os.path.join(__data_dir, "isomer_design_subset.csv")
10
 
11
- DATASET_SMILES = pd.read_csv(__dataset_smiles_file)
12
  SAMPLE_SMILES = pd.read_csv(__sample_smiles_file)
13
- ISOMER_DESIGN_SUBSET = pd.read_csv(__isomer_design_subset_file)
 
4
 
5
  __data_dir = os.path.join(os.path.dirname(__file__), "data")
6
 
 
7
  __sample_smiles_file = os.path.join(__data_dir, "sample_smiles.csv")
8
+ __isomer_design_dataset_file = os.path.join(__data_dir, "isomer_design_dataset.csv")
9
 
 
10
  SAMPLE_SMILES = pd.read_csv(__sample_smiles_file)
11
+ ISOMER_DESIGN_DATASET = pd.read_csv(__isomer_design_dataset_file, sep="\t")
src/data/isomer_design_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
src/data/sample_smiles.csv CHANGED
@@ -1,4 +1,4 @@
1
- smiles,name,category
2
  CCO,Ethanol,Alcohol
3
  CC(=O)O,Acetic acid,Carboxylic acid
4
  c1ccccc1,Benzene,Aromatic hydrocarbon
 
1
+ smiles,name,properties
2
  CCO,Ethanol,Alcohol
3
  CC(=O)O,Acetic acid,Carboxylic acid
4
  c1ccccc1,Benzene,Aromatic hydrocarbon
src/service.py CHANGED
@@ -1,5 +1,6 @@
1
  import logging
2
  import os
 
3
  from typing import TypedDict
4
 
5
  import numpy as np
@@ -21,7 +22,7 @@ from constants import (
21
  SUPPORTED_EMBEDDING_DIMENSIONS,
22
  USE_HALF_PRECISION,
23
  )
24
- from data import DATASET_SMILES, ISOMER_DESIGN_SUBSET
25
 
26
 
27
  def setup_logger(clear_handler=False):
@@ -40,7 +41,7 @@ logger = setup_logger(clear_handler=True)
40
  class SimilarMolecule(TypedDict):
41
  smiles: str
42
  name: str
43
- category: str
44
  score: float
45
 
46
 
@@ -84,15 +85,24 @@ class MolecularEmbeddingService:
84
  password=redis_password,
85
  decode_responses=True,
86
  )
87
- redis_client.ping()
88
- return redis_client
89
  except Exception as e:
90
  logger.error(f"Failed to connect to Redis: {e}")
91
  raise
92
 
 
 
 
 
 
 
 
 
 
 
 
93
  def _initialize_datastore(self):
94
  self.__create_hnsw_index()
95
- self.__populate_sample_data(ISOMER_DESIGN_SUBSET)
96
 
97
  def __create_hnsw_index(self):
98
  """Create HNSW index for molecular embeddings"""
@@ -195,14 +205,14 @@ class MolecularEmbeddingService:
195
  query = (
196
  Query(f"*=>[KNN {k} @{self.embedding_field_name(embed_dim)} $vec AS score]")
197
  .sort_by("score")
198
- .return_fields("smiles", "name", "category", "score")
199
  .dialect(2)
200
  )
201
 
202
  results = self.redis_client.ft(self.index_name).search(query, query_params={"vec": query_vector})
203
 
204
  neighbors: list[SimilarMolecule] = [
205
- {"smiles": doc.smiles, "name": doc.name, "category": doc.category, "score": float(doc.score)}
206
  for doc in results.docs
207
  ]
208
 
@@ -212,6 +222,16 @@ class MolecularEmbeddingService:
212
  logger.error(f"Failed to find similar molecules: {e}")
213
  return []
214
 
 
 
 
 
 
 
 
 
 
 
215
  @staticmethod
216
  def embedding_field_name(dim: int) -> str:
217
  return f"embedding_{dim}"
 
1
  import logging
2
  import os
3
+ import time
4
  from typing import TypedDict
5
 
6
  import numpy as np
 
22
  SUPPORTED_EMBEDDING_DIMENSIONS,
23
  USE_HALF_PRECISION,
24
  )
25
+ from data import ISOMER_DESIGN_DATASET
26
 
27
 
28
  def setup_logger(clear_handler=False):
 
41
  class SimilarMolecule(TypedDict):
42
  smiles: str
43
  name: str
44
+ properties: str
45
  score: float
46
 
47
 
 
85
  password=redis_password,
86
  decode_responses=True,
87
  )
 
 
88
  except Exception as e:
89
  logger.error(f"Failed to connect to Redis: {e}")
90
  raise
91
 
92
+ while True:
93
+ try:
94
+ redis_client.ping()
95
+ break
96
+ except redis.exceptions.BusyLoadingError:
97
+ time_out = 5
98
+ logger.warning(f"Redis is loading the dataset in memory. Retrying in {time_out} seconds...")
99
+ time.sleep(time_out)
100
+
101
+ return redis_client
102
+
103
  def _initialize_datastore(self):
104
  self.__create_hnsw_index()
105
+ self.__populate_sample_data(ISOMER_DESIGN_DATASET)
106
 
107
  def __create_hnsw_index(self):
108
  """Create HNSW index for molecular embeddings"""
 
205
  query = (
206
  Query(f"*=>[KNN {k} @{self.embedding_field_name(embed_dim)} $vec AS score]")
207
  .sort_by("score")
208
+ .return_fields("smiles", "name", "properties", "score")
209
  .dialect(2)
210
  )
211
 
212
  results = self.redis_client.ft(self.index_name).search(query, query_params={"vec": query_vector})
213
 
214
  neighbors: list[SimilarMolecule] = [
215
+ {"smiles": doc.smiles, "name": doc.name, "properties": doc.properties, "score": float(doc.score)}
216
  for doc in results.docs
217
  ]
218
 
 
222
  logger.error(f"Failed to find similar molecules: {e}")
223
  return []
224
 
225
+ def get_canonical_smiles(self, smiles: str) -> str:
226
+ """Convert SMILES to canonical SMILES representation"""
227
+ if not smiles or smiles.strip() == "":
228
+ return ""
229
+
230
+ canonical = MorganFingerprinter.canonicalize_smiles(smiles.strip())
231
+ if canonical:
232
+ return canonical
233
+ return smiles.strip()
234
+
235
  @staticmethod
236
  def embedding_field_name(dim: int) -> str:
237
  return f"embedding_{dim}"
src/static/main.js CHANGED
@@ -270,7 +270,7 @@ window.setJSMESmiles = function (smiles) {
270
  window.clearJSME = function () {
271
  jsmeApplet?.reset();
272
  updateGradioTextbox("");
273
- return ["", [], [], "Cleared - Draw a new molecule or enter SMILES"];
274
  };
275
 
276
  // ============================================================================
 
270
  window.clearJSME = function () {
271
  jsmeApplet?.reset();
272
  updateGradioTextbox("");
273
+ return ["", "", [], [], "Cleared - Draw a new molecule or enter SMILES"];
274
  };
275
 
276
  // ============================================================================
src/static/main.min.js CHANGED
@@ -1 +1 @@
1
- function initializeJSME(){try{jsmeApplet=new JSApplet.JSME("jsme_container",getJsmeContainerWidthPx(),"450px",{options:"NOcanonize,rButton,zoom,zoomgui,newLook,star,multipart,polarnitro,NOexportInChI,NOexportInChIkey,NOsearchInChIkey,NOexportSVG,NOpaste"}),jsmeApplet.setCallBack("AfterStructureModified",handleJSMEStructureChange),jsmeApplet.setMenuScale(getJsmeGuiScale()),jsmeApplet.setUserInterfaceBackgroundColor("#adadad"),jsmeApplet.readGenericMolecularInput("CCO"),lastTextboxValue="CCO",setupTextboxEventListeners(),window.addEventListener("resize",handleResize)}catch(e){throw e}}function handleJSMEStructureChange(){try{updateGradioTextbox(jsmeApplet.smiles())}catch(e){}}function getJsmeGuiScale(){const e=getJsmeContainerWidthNumber();if(null==e||e<=0)return 1;let t;return t=e>460?1.3:e>420?1.1:e>370?1.05:e>300?.88:2,t}function getJsmeContainerWidthPx(){const e=getJsmeContainerWidthNumber();return null==e||e<=0?"100%":`${e}px`}function getJsmeContainerWidthNumber(){const e=document.getElementById("jsme_container");return e?.parentNode?.offsetWidth}function updateGradioTextbox(e){try{const t=document.querySelector(SMILES_INPUT_SELECTOR);if(t?.value===e)return;t.value=e,lastTextboxValue=e,GRADIO_CHANGE_EVENTS.forEach((e=>{const n=new Event(e,{bubbles:!0,cancelable:!0});t.dispatchEvent(n)}))}catch(e){}}function updateJSMEFromTextbox(e){try{""!==e?.trim()?jsmeApplet?.readGenericMolecularInput(e.trim()):jsmeApplet?.reset(),lastTextboxValue=e}catch(e){}}function setupTextboxEventListeners(){const e=document.querySelector(SMILES_INPUT_SELECTOR);e&&(e.addEventListener("input",handleTextboxChange),e.addEventListener("change",handleTextboxChange),e.addEventListener("paste",handleTextboxPaste),e.addEventListener("keyup",handleTextboxChange))}function handleTextboxChange(e){e.target.value!==lastTextboxValue&&updateJSMEFromTextbox(e.target.value)}function handleTextboxPaste(e){setTimeout((()=>{updateJSMEFromTextbox(e.target.value)}),50)}function 
handleResize(){try{jsmeApplet?.setMenuScale(getJsmeGuiScale()),jsmeApplet?.setWidth(getJsmeContainerWidthPx())}catch(e){}}function initializeWhenReady(){"undefined"!=typeof JSApplet&&JSApplet.JSME?initializeJSME():setTimeout(initializeWhenReady,2e3)}function startInitialization(){"loading"===document.readyState?document.addEventListener("DOMContentLoaded",(()=>{setTimeout(initializeWhenReady,2e3)})):setTimeout(initializeWhenReady,2e3)}let jsmeApplet=null,lastTextboxValue="";const DEFAULT_SMILES="CCO",CONTAINER_HEIGHT="450px",SMILES_INPUT_SELECTOR="#smiles_input textarea, #smiles_input input",PASTE_DELAY=50,INIT_RETRY_DELAY=2e3,GRADIO_CHANGE_EVENTS=["input","change","keyup"];window.setJSMESmiles=function(e){return updateJSMEFromTextbox(e),updateGradioTextbox(e),e},window.clearJSME=function(){return jsmeApplet?.reset(),updateGradioTextbox(""),["",[],[],"Cleared - Draw a new molecule or enter SMILES"]},startInitialization();
 
1
+ function initializeJSME(){try{jsmeApplet=new JSApplet.JSME("jsme_container",getJsmeContainerWidthPx(),"450px",{options:"NOcanonize,rButton,zoom,zoomgui,newLook,star,multipart,polarnitro,NOexportInChI,NOexportInChIkey,NOsearchInChIkey,NOexportSVG,NOpaste"}),jsmeApplet.setCallBack("AfterStructureModified",handleJSMEStructureChange),jsmeApplet.setMenuScale(getJsmeGuiScale()),jsmeApplet.setUserInterfaceBackgroundColor("#adadad"),jsmeApplet.readGenericMolecularInput("CCO"),lastTextboxValue="CCO",setupTextboxEventListeners(),window.addEventListener("resize",handleResize)}catch(e){throw e}}function handleJSMEStructureChange(){try{updateGradioTextbox(jsmeApplet.smiles())}catch(e){}}function getJsmeGuiScale(){const e=getJsmeContainerWidthNumber();if(null==e||e<=0)return 1;let t;return t=e>460?1.3:e>420?1.1:e>370?1.05:e>300?.88:2,t}function getJsmeContainerWidthPx(){const e=getJsmeContainerWidthNumber();return null==e||e<=0?"100%":`${e}px`}function getJsmeContainerWidthNumber(){const e=document.getElementById("jsme_container");return e?.parentNode?.offsetWidth}function updateGradioTextbox(e){try{const t=document.querySelector(SMILES_INPUT_SELECTOR);if(t?.value===e)return;t.value=e,lastTextboxValue=e,GRADIO_CHANGE_EVENTS.forEach((e=>{const n=new Event(e,{bubbles:!0,cancelable:!0});t.dispatchEvent(n)}))}catch(e){}}function updateJSMEFromTextbox(e){try{""!==e?.trim()?jsmeApplet?.readGenericMolecularInput(e.trim()):jsmeApplet?.reset(),lastTextboxValue=e}catch(e){}}function setupTextboxEventListeners(){const e=document.querySelector(SMILES_INPUT_SELECTOR);e&&(e.addEventListener("input",handleTextboxChange),e.addEventListener("change",handleTextboxChange),e.addEventListener("paste",handleTextboxPaste),e.addEventListener("keyup",handleTextboxChange))}function handleTextboxChange(e){e.target.value!==lastTextboxValue&&updateJSMEFromTextbox(e.target.value)}function handleTextboxPaste(e){setTimeout((()=>{updateJSMEFromTextbox(e.target.value)}),50)}function 
handleResize(){try{jsmeApplet?.setMenuScale(getJsmeGuiScale()),jsmeApplet?.setWidth(getJsmeContainerWidthPx())}catch(e){}}function initializeWhenReady(){"undefined"!=typeof JSApplet&&JSApplet.JSME?initializeJSME():setTimeout(initializeWhenReady,2e3)}function startInitialization(){"loading"===document.readyState?document.addEventListener("DOMContentLoaded",(()=>{setTimeout(initializeWhenReady,2e3)})):setTimeout(initializeWhenReady,2e3)}let jsmeApplet=null,lastTextboxValue="";const DEFAULT_SMILES="CCO",CONTAINER_HEIGHT="450px",SMILES_INPUT_SELECTOR="#smiles_input textarea, #smiles_input input",PASTE_DELAY=50,INIT_RETRY_DELAY=2e3,GRADIO_CHANGE_EVENTS=["input","change","keyup"];window.setJSMESmiles=function(e){return updateJSMEFromTextbox(e),updateGradioTextbox(e),e},window.clearJSME=function(){return jsmeApplet?.reset(),updateGradioTextbox(""),["","",[],[],"Cleared - Draw a new molecule or enter SMILES"]},startInitialization();