Spaces:
Running
Running
Paco Nathan
commited on
Commit
·
91eaff6
0
Parent(s):
A new start
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +37 -0
- .github/FUNDING.yml +1 -0
- .github/dependabot.yml +9 -0
- .github/workflows/ci.yml +38 -0
- .gitignore +173 -0
- .pre-commit-config.yaml +37 -0
- CITATION +8 -0
- LICENSE +21 -0
- MANIFEST.in +10 -0
- NOTES.md +39 -0
- PROMPT.md +15 -0
- README.md +118 -0
- SECURITY.md +14 -0
- app.py +459 -0
- bin/nb_md.sh +15 -0
- bin/preview.py +50 -0
- bin/push_pypi.sh +10 -0
- bin/vis_doc.py +196 -0
- demo.py +220 -0
- docs/abstract.md +47 -0
- docs/ack.md +11 -0
- docs/assets/favicon.png +0 -0
- docs/assets/hitl.png +0 -0
- docs/assets/logo.png +0 -0
- docs/assets/nouns/api.png +0 -0
- docs/assets/nouns/biblio.png +0 -0
- docs/assets/nouns/community.png +0 -0
- docs/assets/nouns/concepts.png +0 -0
- docs/assets/nouns/discovery.png +0 -0
- docs/assets/nouns/evidence.png +0 -0
- docs/assets/nouns/feedback.png +0 -0
- docs/assets/nouns/howto.png +0 -0
- docs/assets/nouns/tutorial.png +0 -0
- docs/assets/textgraphs.graffle +3 -0
- docs/biblio.md +232 -0
- docs/build.md +132 -0
- docs/conclude.md +53 -0
- docs/details.md +64 -0
- docs/ex0_0.md +689 -0
- docs/ex0_0_files/ex0_0_17_0.svg +324 -0
- docs/ex0_0_files/ex0_0_37_0.jpg +0 -0
- docs/ex0_0_files/ex0_0_37_0.png +0 -0
- docs/ex0_0_files/ex0_0_39_0.jpg +0 -0
- docs/ex0_0_files/ex0_0_39_0.png +0 -0
- docs/ex0_0_files/ex0_0_40_0.png +0 -0
- docs/ex0_0_files/ex0_0_42_0.png +0 -0
- docs/ex0_0_files/tmp.fig01.png +0 -0
- docs/ex0_0_files/tmp.fig02.png +0 -0
- docs/ex1_0.md +776 -0
- docs/ex1_0_files/ex1_0_22_0.png +0 -0
.gitattributes
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.graffle filter=lfs diff=lfs merge=lfs -text
|
37 |
+
docs/assets/textgraphs.graffle filter=lfs diff=lfs merge=lfs -text
|
.github/FUNDING.yml
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
github: ceteri
|
.github/dependabot.yml
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Please see the documentation for all configuration options:
|
2 |
+
# https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
|
3 |
+
|
4 |
+
version: 2
|
5 |
+
updates:
|
6 |
+
- package-ecosystem: "pip"
|
7 |
+
directory: "/"
|
8 |
+
schedule:
|
9 |
+
interval: "daily"
|
.github/workflows/ci.yml
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: CI
|
2 |
+
|
3 |
+
on: [pull_request, workflow_dispatch]
|
4 |
+
|
5 |
+
jobs:
|
6 |
+
# pre-commit:
|
7 |
+
# name: Run pre-commit
|
8 |
+
# runs-on: ubuntu-latest
|
9 |
+
# steps:
|
10 |
+
# - uses: actions/checkout@v3
|
11 |
+
# - uses: actions/setup-python@v3
|
12 |
+
# - uses: pre-commit/action@v3.0.0
|
13 |
+
|
14 |
+
test:
|
15 |
+
name: Tests for Python ${{ matrix.python-version }}
|
16 |
+
runs-on: ubuntu-latest
|
17 |
+
strategy:
|
18 |
+
matrix:
|
19 |
+
python-version: ['3.10']
|
20 |
+
fail-fast: false
|
21 |
+
# needs: pre-commit
|
22 |
+
|
23 |
+
steps:
|
24 |
+
- uses: actions/checkout@v3
|
25 |
+
|
26 |
+
- name: Set up Python
|
27 |
+
uses: actions/setup-python@v3
|
28 |
+
with:
|
29 |
+
python-version: ${{ matrix.python-version }}
|
30 |
+
|
31 |
+
- name: Install dependencies
|
32 |
+
run: |
|
33 |
+
pip install -e .
|
34 |
+
pip install -r requirements-dev.txt
|
35 |
+
|
36 |
+
- name: Run tests
|
37 |
+
run: |
|
38 |
+
pytest
|
.gitignore
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# local files
|
2 |
+
*~
|
3 |
+
chromedriver
|
4 |
+
lemma.json
|
5 |
+
lemma.ttl
|
6 |
+
lemma.zip
|
7 |
+
lemma_graph.zip
|
8 |
+
examples/tmp.*.html
|
9 |
+
vis.html
|
10 |
+
gor.html
|
11 |
+
txg.tgz
|
12 |
+
s2v_old/
|
13 |
+
|
14 |
+
# Byte-compiled / optimized / DLL files
|
15 |
+
__pycache__/
|
16 |
+
*.py[cod]
|
17 |
+
*$py.class
|
18 |
+
|
19 |
+
# C extensions
|
20 |
+
*.so
|
21 |
+
|
22 |
+
# Distribution / packaging
|
23 |
+
.Python
|
24 |
+
build/
|
25 |
+
develop-eggs/
|
26 |
+
dist/
|
27 |
+
downloads/
|
28 |
+
eggs/
|
29 |
+
.eggs/
|
30 |
+
lib/
|
31 |
+
lib64/
|
32 |
+
parts/
|
33 |
+
sdist/
|
34 |
+
var/
|
35 |
+
wheels/
|
36 |
+
share/python-wheels/
|
37 |
+
*.egg-info/
|
38 |
+
.installed.cfg
|
39 |
+
*.egg
|
40 |
+
MANIFEST
|
41 |
+
|
42 |
+
# PyInstaller
|
43 |
+
# Usually these files are written by a python script from a template
|
44 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
45 |
+
*.manifest
|
46 |
+
*.spec
|
47 |
+
|
48 |
+
# Installer logs
|
49 |
+
pip-log.txt
|
50 |
+
pip-delete-this-directory.txt
|
51 |
+
|
52 |
+
# Unit test / coverage reports
|
53 |
+
htmlcov/
|
54 |
+
.tox/
|
55 |
+
.nox/
|
56 |
+
.coverage
|
57 |
+
.coverage.*
|
58 |
+
.cache
|
59 |
+
nosetests.xml
|
60 |
+
coverage.xml
|
61 |
+
*.cover
|
62 |
+
*.py,cover
|
63 |
+
.hypothesis/
|
64 |
+
.pytest_cache/
|
65 |
+
cover/
|
66 |
+
|
67 |
+
# Translations
|
68 |
+
*.mo
|
69 |
+
*.pot
|
70 |
+
|
71 |
+
# Django stuff:
|
72 |
+
*.log
|
73 |
+
local_settings.py
|
74 |
+
db.sqlite3
|
75 |
+
db.sqlite3-journal
|
76 |
+
|
77 |
+
# Flask stuff:
|
78 |
+
instance/
|
79 |
+
.webassets-cache
|
80 |
+
|
81 |
+
# Scrapy stuff:
|
82 |
+
.scrapy
|
83 |
+
|
84 |
+
# Sphinx documentation
|
85 |
+
docs/_build/
|
86 |
+
|
87 |
+
# PyBuilder
|
88 |
+
.pybuilder/
|
89 |
+
target/
|
90 |
+
|
91 |
+
# Jupyter Notebook
|
92 |
+
.ipynb_checkpoints
|
93 |
+
|
94 |
+
# IPython
|
95 |
+
profile_default/
|
96 |
+
ipython_config.py
|
97 |
+
|
98 |
+
# pyenv
|
99 |
+
# For a library or package, you might want to ignore these files since the code is
|
100 |
+
# intended to run in multiple environments; otherwise, check them in:
|
101 |
+
# .python-version
|
102 |
+
|
103 |
+
# pipenv
|
104 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
105 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
106 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
107 |
+
# install all needed dependencies.
|
108 |
+
#Pipfile.lock
|
109 |
+
|
110 |
+
# poetry
|
111 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
112 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
113 |
+
# commonly ignored for libraries.
|
114 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
115 |
+
#poetry.lock
|
116 |
+
|
117 |
+
# pdm
|
118 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
119 |
+
#pdm.lock
|
120 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
121 |
+
# in version control.
|
122 |
+
# https://pdm.fming.dev/#use-with-ide
|
123 |
+
.pdm.toml
|
124 |
+
|
125 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
126 |
+
__pypackages__/
|
127 |
+
|
128 |
+
# Celery stuff
|
129 |
+
celerybeat-schedule
|
130 |
+
celerybeat.pid
|
131 |
+
|
132 |
+
# SageMath parsed files
|
133 |
+
*.sage.py
|
134 |
+
|
135 |
+
# Environments
|
136 |
+
.env
|
137 |
+
.venv
|
138 |
+
env/
|
139 |
+
venv/
|
140 |
+
ENV/
|
141 |
+
env.bak/
|
142 |
+
venv.bak/
|
143 |
+
|
144 |
+
# Spyder project settings
|
145 |
+
.spyderproject
|
146 |
+
.spyproject
|
147 |
+
|
148 |
+
# Rope project settings
|
149 |
+
.ropeproject
|
150 |
+
|
151 |
+
# mkdocs documentation
|
152 |
+
/site
|
153 |
+
|
154 |
+
# mypy
|
155 |
+
.mypy_cache/
|
156 |
+
.dmypy.json
|
157 |
+
dmypy.json
|
158 |
+
|
159 |
+
# Pyre type checker
|
160 |
+
.pyre/
|
161 |
+
|
162 |
+
# pytype static type analyzer
|
163 |
+
.pytype/
|
164 |
+
|
165 |
+
# Cython debug symbols
|
166 |
+
cython_debug/
|
167 |
+
|
168 |
+
# PyCharm
|
169 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
170 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
171 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
172 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
173 |
+
#.idea/
|
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# See https://pre-commit.com for more information
|
2 |
+
# See https://pre-commit.com/hooks.html for more hooks
|
3 |
+
default_stages: [commit, push]
|
4 |
+
default_language_version:
|
5 |
+
python: python3
|
6 |
+
exclude: "deprecated"
|
7 |
+
repos:
|
8 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
9 |
+
rev: v4.4.0
|
10 |
+
hooks:
|
11 |
+
- id: trailing-whitespace
|
12 |
+
exclude: ^docs/
|
13 |
+
- id: check-builtin-literals
|
14 |
+
- id: check-executables-have-shebangs
|
15 |
+
- id: check-merge-conflict
|
16 |
+
- id: check-json
|
17 |
+
- id: check-yaml
|
18 |
+
- id: debug-statements
|
19 |
+
- id: detect-private-key
|
20 |
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
21 |
+
rev: v1.4.1
|
22 |
+
hooks:
|
23 |
+
- id: mypy # type annotations
|
24 |
+
exclude: ^tests/,^venv/
|
25 |
+
- repo: https://github.com/PyCQA/pylint
|
26 |
+
rev: v2.17.4
|
27 |
+
hooks:
|
28 |
+
- id: pylint
|
29 |
+
exclude: error.py
|
30 |
+
- repo: https://github.com/codespell-project/codespell
|
31 |
+
rev: v2.2.4
|
32 |
+
hooks:
|
33 |
+
- id: codespell # spell-check source code
|
34 |
+
args: ["-L", "basf,textgraph,udo"] # comma separated stop words
|
35 |
+
exclude: ^README.md|^NOTES.md|^examples|^docs/ack.md|^docs/biblio.md
|
36 |
+
language: python
|
37 |
+
types: [text]
|
CITATION
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
@software{TextGraphs,
|
2 |
+
author = {Paco Nathan},
|
3 |
+
title = {{TextGraphs + LLMs + graph ML for entity extraction, linking, ranking, and constructing a lemma graph}},
|
4 |
+
year = 2023,
|
5 |
+
publisher = {Derwen},
|
6 |
+
doi = {10.5281/zenodo.10431783},
|
7 |
+
url = {https://github.com/DerwenAI/textgraphs}
|
8 |
+
}
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023-2024 Derwen, Inc.
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
MANIFEST.in
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
include LICENSE
|
2 |
+
include README.md
|
3 |
+
include pyproject.toml
|
4 |
+
include requirements.txt
|
5 |
+
include setup.py
|
6 |
+
include tests/*.py
|
7 |
+
include textgraphs/*.py
|
8 |
+
prune .ipynb_checkpoints
|
9 |
+
prune docs
|
10 |
+
prune venv
|
NOTES.md
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
TODO:
|
2 |
+
|
3 |
+
* can we build a causal graph of the provenance?
|
4 |
+
- https://www.pywhy.org/dowhy/v0.11.1/
|
5 |
+
|
6 |
+
* target publications:
|
7 |
+
- https://drops.dagstuhl.de/entities/issue/TGDK-volume-1-issue-1
|
8 |
+
|
9 |
+
* impl a _semantic random walk_ from a source KG
|
10 |
+
|
11 |
+
* link entities for lemmas, noun chunks using MediaWiki lookups?
|
12 |
+
- apply default semantics: `skos:related`
|
13 |
+
|
14 |
+
* eval clustering/community detection for GOR?
|
15 |
+
- https://github.com/MengLiuPurdue/LocalGraphClustering
|
16 |
+
|
17 |
+
* RAG example
|
18 |
+
- https://docs.llamaindex.ai/en/latest/examples/index_structs/knowledge_graph/KuzuGraphDemo.html#query-with-embeddings
|
19 |
+
|
20 |
+
* extend GOR to replicate NodePiece/ULTRA ?
|
21 |
+
|
22 |
+
* reify GOR, then use FastRP to generate embeddings?
|
23 |
+
- https://github.com/Knorreman/fastRP
|
24 |
+
|
25 |
+
* eval community detection to condense nodes using k-medoids?
|
26 |
+
- https://medium.com/neo4j/clustering-graph-data-with-k-medoids-3b6a67ea0873
|
27 |
+
|
28 |
+
* add conda packaging
|
29 |
+
- https://conda.github.io/grayskull/
|
30 |
+
|
31 |
+
|
32 |
+
* SPARQL the DBPedia/Wikidata equivs
|
33 |
+
|
34 |
+
* other NER/RE:
|
35 |
+
- https://github.com/dwadden/dygiepp?tab=readme-ov-file#pretrained-models
|
36 |
+
|
37 |
+
* check out https://github.com/wikipedia2vec/wikipedia2vec
|
38 |
+
|
39 |
+
* link `sense2vec` synonyms; make affordances for UI to annotate synonyms
|
PROMPT.md
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
https://medium.com/@nizami_muhammad/extracting-relation-from-sentence-using-llm-597d0c0310a8
|
2 |
+
|
3 |
+
Sentence: Werner Herzog is the son of Dietrich Herzog
|
4 |
+
Extract RDF predicate from the sentence in this format:
|
5 |
+
subject:<subject>
|
6 |
+
predicate:<predicate>
|
7 |
+
object:<object, optional>
|
8 |
+
|
9 |
+
---
|
10 |
+
|
11 |
+
Sentence: Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog. After the war, Werner fled to America to become famous. Instead he became President and decided to nuke Slovenia.
|
12 |
+
Be brief, extract the top RDF predicate in DBPedia for the relation between <http://dbpedia.org/resource/Werner_Herzog><http://dbpedia.org/resource/Germany> in this format:
|
13 |
+
subject:<subject>
|
14 |
+
predicate:<predicate>
|
15 |
+
object:<object, optional>
|
README.md
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: TextGraphs
|
3 |
+
emoji: ✴
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: gray
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.28.2
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
|
14 |
+
# TextGraphs
|
15 |
+
|
16 |
+
[](https://zenodo.org/doi/10.5281/zenodo.10431783)
|
17 |
+

|
18 |
+
[](http://mypy-lang.org/)
|
19 |
+

|
20 |
+
<br/>
|
21 |
+

|
22 |
+

|
23 |
+

|
24 |
+
|
25 |
+
<img
|
26 |
+
alt="TextGraphs logo"
|
27 |
+
src="https://raw.githubusercontent.com/DerwenAI/textgraphs/main/docs/assets/logo.png"
|
28 |
+
width="231"
|
29 |
+
/>
|
30 |
+
|
31 |
+
|
32 |
+
## project info
|
33 |
+
|
34 |
+
Project home: <https://huggingface.co/spaces/DerwenAI/textgraphs>
|
35 |
+
|
36 |
+
Full documentation: <https://derwen.ai/docs/txg/>
|
37 |
+
|
38 |
+
Sample code is provided in `demo.py`
|
39 |
+
|
40 |
+
|
41 |
+
## requirements
|
42 |
+
|
43 |
+
* Python 3.10+
|
44 |
+
|
45 |
+
|
46 |
+
## deploy library from PyPi
|
47 |
+
|
48 |
+
Prepare the virtual environment:
|
49 |
+
|
50 |
+
```bash
|
51 |
+
python3 -m venv venv
|
52 |
+
source venv/bin/activate
|
53 |
+
python3 -m pip install -U pip wheel setuptools
|
54 |
+
```
|
55 |
+
|
56 |
+
Install from [PyPi](https://pypi.python.org/pypi/textgraphs):
|
57 |
+
|
58 |
+
```bash
|
59 |
+
python3 -m pip install -U textgraphs
|
60 |
+
```
|
61 |
+
|
62 |
+
|
63 |
+
## run demos locally
|
64 |
+
|
65 |
+
```bash
|
66 |
+
python3 demo.py
|
67 |
+
```
|
68 |
+
|
69 |
+
```bash
|
70 |
+
streamlit run app.py
|
71 |
+
```
|
72 |
+
|
73 |
+
|
74 |
+
## install library from source locally
|
75 |
+
|
76 |
+
```bash
|
77 |
+
python3 -m venv venv
|
78 |
+
source venv/bin/activate
|
79 |
+
|
80 |
+
python3 -m pip install -U pip wheel setuptools
|
81 |
+
python3 -m pip install -e .
|
82 |
+
```
|
83 |
+
|
84 |
+
To run the Streamlit or JupyterLab demos, also install:
|
85 |
+
|
86 |
+
```bash
|
87 |
+
python3 -m pip install -r requirements-dev.txt
|
88 |
+
```
|
89 |
+
|
90 |
+
|
91 |
+
## license and copyright
|
92 |
+
|
93 |
+
Source code for **TextGraphs** plus its logo, documentation, and
|
94 |
+
examples have an [MIT license](https://spdx.org/licenses/MIT.html)
|
95 |
+
which is succinct and simplifies use in commercial applications.
|
96 |
+
|
97 |
+
All materials herein are Copyright © 2023-2024 Derwen, Inc.
|
98 |
+
|
99 |
+
|
100 |
+
## attribution
|
101 |
+
|
102 |
+
Please use the following BibTeX entry for citing **TextGraphs** if you
|
103 |
+
use it in your research or software:
|
104 |
+
```bibtex
|
105 |
+
@software{TextGraphs,
|
106 |
+
author = {Paco Nathan},
|
107 |
+
title = {{TextGraphs + LLMs + graph ML for entity extraction, linking, ranking, and constructing a lemma graph}},
|
108 |
+
year = 2023,
|
109 |
+
publisher = {Derwen},
|
110 |
+
doi = {10.5281/zenodo.10431783},
|
111 |
+
url = {https://github.com/DerwenAI/textgraphs}
|
112 |
+
}
|
113 |
+
```
|
114 |
+
|
115 |
+
|
116 |
+
## star history
|
117 |
+
|
118 |
+
[](https://star-history.com/#derwenai/textgraphs&Date)
|
SECURITY.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Security Policy
|
2 |
+
|
3 |
+
## Supported Versions
|
4 |
+
|
5 |
+
Versions which are currently being supported with security updates:
|
6 |
+
|
7 |
+
| Version | Supported |
|
8 |
+
| ------- | ------------------ |
|
9 |
+
| > 0.2 | :white_check_mark: |
|
10 |
+
|
11 |
+
## Reporting a Vulnerability
|
12 |
+
|
13 |
+
To report a vulnerability, please create a new [*issue*](https://github.com/DerwenAI/textgraphs/issues).
|
14 |
+
We will be notified immediately, and will attempt to respond on the reported issue immediately.
|
app.py
ADDED
@@ -0,0 +1,459 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
# pylint: disable=C0301
|
4 |
+
|
5 |
+
"""
|
6 |
+
HuggingFace Spaces demo of the `TextGraphs` library using Streamlit
|
7 |
+
|
8 |
+
see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md
|
9 |
+
"""
|
10 |
+
|
11 |
+
import pathlib
|
12 |
+
import time
|
13 |
+
import typing
|
14 |
+
|
15 |
+
import matplotlib.pyplot as plt # pylint: disable=E0401
|
16 |
+
import pandas as pd # pylint: disable=E0401
|
17 |
+
import pyvis # pylint: disable=E0401
|
18 |
+
import spacy # pylint: disable=E0401
|
19 |
+
import streamlit as st # pylint: disable=E0401
|
20 |
+
|
21 |
+
import textgraphs
|
22 |
+
|
23 |
+
|
24 |
+
if __name__ == "__main__":
|
25 |
+
# default text input
|
26 |
+
SRC_TEXT: str = """
|
27 |
+
Werner Herzog is a remarkable filmmaker and intellectual originally from Germany, the son of Dietrich Herzog.
|
28 |
+
"""
|
29 |
+
|
30 |
+
# store the initial value of widgets in session state
|
31 |
+
if "visibility" not in st.session_state:
|
32 |
+
st.session_state.visibility = "visible"
|
33 |
+
st.session_state.disabled = False
|
34 |
+
|
35 |
+
with st.container():
|
36 |
+
st.title("demo: TextGraphs + LLMs to construct a 'lemma graph'")
|
37 |
+
st.markdown(
|
38 |
+
"""
|
39 |
+
docs: <https://derwen.ai/docs/txg/>
|
40 |
+
|
41 |
+
DOI: 10.5281/zenodo.10431783
|
42 |
+
""",
|
43 |
+
unsafe_allow_html = True,
|
44 |
+
)
|
45 |
+
|
46 |
+
|
47 |
+
# collect input + config
|
48 |
+
st.subheader("configure", divider = "rainbow")
|
49 |
+
|
50 |
+
text_input: str = st.text_area(
|
51 |
+
"Source Text:",
|
52 |
+
value = SRC_TEXT.strip(),
|
53 |
+
)
|
54 |
+
|
55 |
+
llm_ner = st.checkbox(
|
56 |
+
"enhance spaCy NER using: SpanMarker",
|
57 |
+
value = False,
|
58 |
+
)
|
59 |
+
|
60 |
+
link_ents = st.checkbox(
|
61 |
+
"link entities using: DBPedia Spotlight, WikiMedia API",
|
62 |
+
value = False,
|
63 |
+
)
|
64 |
+
|
65 |
+
infer_rel = st.checkbox(
|
66 |
+
"infer relations using: REBEL, OpenNRE, qwikidata",
|
67 |
+
value = False,
|
68 |
+
)
|
69 |
+
|
70 |
+
if text_input or llm_ner or link_ents or infer_rel:
|
71 |
+
## parse the document
|
72 |
+
st.subheader("parse the raw text", divider = "rainbow")
|
73 |
+
start_time: float = time.time()
|
74 |
+
|
75 |
+
# generally it is fine to use factory defaults,
|
76 |
+
# although let's illustrate these settings here
|
77 |
+
infer_rels: list = []
|
78 |
+
|
79 |
+
if infer_rel:
|
80 |
+
with st.spinner(text = "load rel models..."):
|
81 |
+
infer_rels = [
|
82 |
+
textgraphs.InferRel_OpenNRE(
|
83 |
+
model = textgraphs.OPENNRE_MODEL,
|
84 |
+
max_skip = textgraphs.MAX_SKIP,
|
85 |
+
min_prob = textgraphs.OPENNRE_MIN_PROB,
|
86 |
+
),
|
87 |
+
textgraphs.InferRel_Rebel(
|
88 |
+
lang = "en_XX",
|
89 |
+
mrebel_model = textgraphs.MREBEL_MODEL,
|
90 |
+
),
|
91 |
+
]
|
92 |
+
|
93 |
+
ner: typing.Optional[ textgraphs.Component ] = None
|
94 |
+
|
95 |
+
if llm_ner:
|
96 |
+
ner = textgraphs.NERSpanMarker(
|
97 |
+
ner_model = textgraphs.NER_MODEL,
|
98 |
+
)
|
99 |
+
|
100 |
+
tg: textgraphs.TextGraphs = textgraphs.TextGraphs(
|
101 |
+
factory = textgraphs.PipelineFactory(
|
102 |
+
spacy_model = textgraphs.SPACY_MODEL,
|
103 |
+
ner = ner,
|
104 |
+
kg = textgraphs.KGWikiMedia(
|
105 |
+
spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API,
|
106 |
+
dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API,
|
107 |
+
dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API,
|
108 |
+
wikidata_api = textgraphs.WIKIDATA_API,
|
109 |
+
min_alias = textgraphs.DBPEDIA_MIN_ALIAS,
|
110 |
+
min_similarity = textgraphs.DBPEDIA_MIN_SIM,
|
111 |
+
),
|
112 |
+
infer_rels = infer_rels,
|
113 |
+
),
|
114 |
+
)
|
115 |
+
|
116 |
+
duration: float = round(time.time() - start_time, 3)
|
117 |
+
st.write(f"set up: {round(duration, 3)} sec")
|
118 |
+
|
119 |
+
with st.spinner(text = "parse text..."):
|
120 |
+
start_time = time.time()
|
121 |
+
|
122 |
+
pipe: textgraphs.Pipeline = tg.create_pipeline(
|
123 |
+
text_input.strip(),
|
124 |
+
)
|
125 |
+
|
126 |
+
duration = round(time.time() - start_time, 3)
|
127 |
+
st.write(f"parse text: {round(duration, 3)} sec, {len(text_input)} characters")
|
128 |
+
|
129 |
+
# render the entity html
|
130 |
+
ent_html: str = spacy.displacy.render(
|
131 |
+
pipe.ner_doc,
|
132 |
+
style = "ent",
|
133 |
+
jupyter = False,
|
134 |
+
)
|
135 |
+
|
136 |
+
st.markdown(
|
137 |
+
ent_html,
|
138 |
+
unsafe_allow_html = True,
|
139 |
+
)
|
140 |
+
|
141 |
+
# generate dependencies as an SVG
|
142 |
+
dep_svg = spacy.displacy.render(
|
143 |
+
pipe.ner_doc,
|
144 |
+
style = "dep",
|
145 |
+
jupyter = False,
|
146 |
+
)
|
147 |
+
|
148 |
+
st.image(
|
149 |
+
dep_svg,
|
150 |
+
width = 800,
|
151 |
+
use_column_width = "never",
|
152 |
+
)
|
153 |
+
|
154 |
+
|
155 |
+
## collect graph elements from the parse
|
156 |
+
st.subheader("construct the base level of the lemma graph", divider = "rainbow")
|
157 |
+
start_time = time.time()
|
158 |
+
|
159 |
+
tg.collect_graph_elements(
|
160 |
+
pipe,
|
161 |
+
debug = False,
|
162 |
+
)
|
163 |
+
|
164 |
+
duration = round(time.time() - start_time, 3)
|
165 |
+
st.write(f"collect elements: {round(duration, 3)} sec, {len(tg.nodes)} nodes, {len(tg.edges)} edges")
|
166 |
+
|
167 |
+
## perform entity linking
|
168 |
+
if link_ents:
|
169 |
+
st.subheader("extract entities and perform entity linking", divider = "rainbow")
|
170 |
+
|
171 |
+
with st.spinner(text = "entity linking..."):
|
172 |
+
start_time = time.time()
|
173 |
+
|
174 |
+
tg.perform_entity_linking(
|
175 |
+
pipe,
|
176 |
+
debug = False,
|
177 |
+
)
|
178 |
+
|
179 |
+
duration = round(time.time() - start_time, 3)
|
180 |
+
st.write(f"entity linking: {round(duration, 3)} sec")
|
181 |
+
|
182 |
+
|
183 |
+
## perform relation extraction
|
184 |
+
if infer_rel:
|
185 |
+
st.subheader("infer relations", divider = "rainbow")
|
186 |
+
st.write("NB: this part runs an order of magnitude more *slooooooowly* on HF Spaces")
|
187 |
+
|
188 |
+
with st.spinner(text = "relation extraction..."):
|
189 |
+
start_time = time.time()
|
190 |
+
|
191 |
+
# NB: run this iteratively since Streamlit on HF Spaces is *sloooooooooow*
|
192 |
+
inferred_edges: list = tg.infer_relations(
|
193 |
+
pipe,
|
194 |
+
debug = False,
|
195 |
+
)
|
196 |
+
|
197 |
+
duration = round(time.time() - start_time, 3)
|
198 |
+
|
199 |
+
n_list: list = list(tg.nodes.values())
|
200 |
+
|
201 |
+
df_rel: pd.DataFrame = pd.DataFrame.from_dict([
|
202 |
+
{
|
203 |
+
"src": n_list[edge.src_node].text,
|
204 |
+
"dst": n_list[edge.dst_node].text,
|
205 |
+
"rel": edge.rel,
|
206 |
+
"weight": edge.prob,
|
207 |
+
}
|
208 |
+
for edge in inferred_edges
|
209 |
+
])
|
210 |
+
|
211 |
+
st.dataframe(df_rel)
|
212 |
+
st.write(f"relation extraction: {round(duration, 3)} sec, {len(df_rel)} edges")
|
213 |
+
|
214 |
+
|
215 |
+
## construct the _lemma graph_
|
216 |
+
start_time = time.time()
|
217 |
+
|
218 |
+
tg.construct_lemma_graph(
|
219 |
+
debug = False,
|
220 |
+
)
|
221 |
+
|
222 |
+
duration = round(time.time() - start_time, 3)
|
223 |
+
st.write(f"construct graph: {round(duration, 3)} sec")
|
224 |
+
|
225 |
+
|
226 |
+
## rank the extracted phrases
|
227 |
+
st.subheader("rank the extracted phrases", divider = "rainbow")
|
228 |
+
start_time = time.time()
|
229 |
+
|
230 |
+
tg.calc_phrase_ranks(
|
231 |
+
pr_alpha = textgraphs.PAGERANK_ALPHA,
|
232 |
+
debug = False,
|
233 |
+
)
|
234 |
+
|
235 |
+
df_ent: pd.DataFrame = tg.get_phrases_as_df()
|
236 |
+
|
237 |
+
duration = round(time.time() - start_time, 3)
|
238 |
+
st.write(f"extract: {round(duration, 3)} sec, {len(df_ent)} entities")
|
239 |
+
|
240 |
+
st.dataframe(df_ent)
|
241 |
+
|
242 |
+
|
243 |
+
## generate a word cloud
|
244 |
+
st.subheader("generate a word cloud", divider = "rainbow")
|
245 |
+
|
246 |
+
render: textgraphs.RenderPyVis = tg.create_render()
|
247 |
+
wordcloud = render.generate_wordcloud()
|
248 |
+
|
249 |
+
st.image(
|
250 |
+
wordcloud.to_image(),
|
251 |
+
width = 700,
|
252 |
+
use_column_width = "never",
|
253 |
+
)
|
254 |
+
|
255 |
+
|
256 |
+
## visualize the lemma graph
|
257 |
+
st.subheader("visualize the lemma graph", divider = "rainbow")
|
258 |
+
st.markdown(
|
259 |
+
"""
|
260 |
+
what you get at this stage is a relatively noisy,
|
261 |
+
low-level detailed graph of the parsed text
|
262 |
+
|
263 |
+
the most interesting nodes will probably be either
|
264 |
+
subjects (`nsubj`) or direct objects (`pobj`)
|
265 |
+
"""
|
266 |
+
)
|
267 |
+
|
268 |
+
pv_graph: pyvis.network.Network = render.render_lemma_graph(
|
269 |
+
debug = False,
|
270 |
+
)
|
271 |
+
|
272 |
+
pv_graph.force_atlas_2based(
|
273 |
+
gravity = -38,
|
274 |
+
central_gravity = 0.01,
|
275 |
+
spring_length = 231,
|
276 |
+
spring_strength = 0.7,
|
277 |
+
damping = 0.8,
|
278 |
+
overlap = 0,
|
279 |
+
)
|
280 |
+
|
281 |
+
pv_graph.show_buttons(filter_ = [ "physics" ])
|
282 |
+
pv_graph.toggle_physics(True)
|
283 |
+
|
284 |
+
py_html: pathlib.Path = pathlib.Path("vis.html")
|
285 |
+
pv_graph.save_graph(py_html.as_posix())
|
286 |
+
|
287 |
+
st.components.v1.html(
|
288 |
+
py_html.read_text(encoding = "utf-8"),
|
289 |
+
height = render.HTML_HEIGHT_WITH_CONTROLS,
|
290 |
+
scrolling = False,
|
291 |
+
)
|
292 |
+
|
293 |
+
|
294 |
+
## cluster the communities
|
295 |
+
st.subheader("cluster the communities", divider = "rainbow")
|
296 |
+
st.markdown(
|
297 |
+
"""
|
298 |
+
<details>
|
299 |
+
<summary><strong>About this clustering...</strong></summary>
|
300 |
+
<p>
|
301 |
+
In the tutorial
|
302 |
+
<a href="https://towardsdatascience.com/how-to-convert-any-text-into-a-graph-of-concepts-110844f22a1a" target="_blank">"How to Convert Any Text Into a Graph of Concepts"</a>,
|
303 |
+
Rahul Nayak uses the
|
304 |
+
<a href="https://en.wikipedia.org/wiki/Girvan%E2%80%93Newman_algorithm"><em>girvan-newman</em></a>
|
305 |
+
algorithm to split the graph into communities, then clusters on those communities.
|
306 |
+
His approach works well for unsupervised clustering of key phrases which have been extracted from a collection of many documents.
|
307 |
+
</p>
|
308 |
+
<p>
|
309 |
+
While Nayak was working with entities extracted from "chunks" of text, not with a text graph per se, this approach is useful for identifying network motifs which can be condensed, e.g., to extract a semantic graph overlay as an <em>abstraction layer</em> atop a lemma graph.
|
310 |
+
</p>
|
311 |
+
</details>
|
312 |
+
<br/>
|
313 |
+
""",
|
314 |
+
unsafe_allow_html = True,
|
315 |
+
)
|
316 |
+
|
317 |
+
spring_dist_val = st.slider(
|
318 |
+
"spring distance for NetworkX clusters",
|
319 |
+
min_value = 0.0,
|
320 |
+
max_value = 10.0,
|
321 |
+
value = 1.2,
|
322 |
+
)
|
323 |
+
|
324 |
+
if spring_dist_val:
|
325 |
+
start_time = time.time()
|
326 |
+
fig, ax = plt.subplots()
|
327 |
+
|
328 |
+
comm_map: dict = render.draw_communities(
|
329 |
+
spring_distance = spring_dist_val,
|
330 |
+
)
|
331 |
+
|
332 |
+
st.pyplot(fig)
|
333 |
+
|
334 |
+
duration = round(time.time() - start_time, 3)
|
335 |
+
st.write(f"cluster: {round(duration, 3)} sec, {max(comm_map.values()) + 1} clusters")
|
336 |
+
|
337 |
+
|
338 |
+
## transform a graph of relations
|
339 |
+
st.subheader("transform as a graph of relations", divider = "rainbow")
|
340 |
+
st.markdown(
|
341 |
+
"""
|
342 |
+
Using the topological transform given in `lee2023ingram`, construct a
|
343 |
+
_graph of relations_ for enhancing graph inference.
|
344 |
+
|
345 |
+
<details>
|
346 |
+
<summary><strong>What does this transform provide?</strong></summary>
|
347 |
+
<p>
|
348 |
+
By using a <em>graph of relations</em> dual representation of our graph data, first and foremost we obtain a more compact representation of the relations in the graph, and means of making inferences (e.g., <em>link prediction</em>) where there is substantially more invariance in the training data.
|
349 |
+
</p>
|
350 |
+
<p>
|
351 |
+
Also recognize that for a parse graph of a paragraph in the English language, the most interesting nodes will probably be either subjects (<code>nsubj</code>) or direct objects (<code>pobj</code>). Here in the <em>graph of relations</em> we can see illustrated how the important details from <em>entity linking</em> tend to cluster near either <code>nsubj</code> or <code>pobj</code> entities, connected through punctuation. This aspect is not as readily observed in the earlier visualization of the <em>lemma graph</em>.
|
352 |
+
</p>
|
353 |
+
</details>
|
354 |
+
""",
|
355 |
+
unsafe_allow_html = True,
|
356 |
+
)
|
357 |
+
|
358 |
+
start_time = time.time()
|
359 |
+
|
360 |
+
gor: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(tg)
|
361 |
+
gor.seeds()
|
362 |
+
gor.construct_gor()
|
363 |
+
|
364 |
+
scores: typing.Dict[ tuple, float ] = gor.get_affinity_scores()
|
365 |
+
pv_graph = gor.render_gor_pyvis(scores)
|
366 |
+
|
367 |
+
pv_graph.force_atlas_2based(
|
368 |
+
gravity = -38,
|
369 |
+
central_gravity = 0.01,
|
370 |
+
spring_length = 231,
|
371 |
+
spring_strength = 0.7,
|
372 |
+
damping = 0.8,
|
373 |
+
overlap = 0,
|
374 |
+
)
|
375 |
+
|
376 |
+
pv_graph.show_buttons(filter_ = [ "physics" ])
|
377 |
+
pv_graph.toggle_physics(True)
|
378 |
+
|
379 |
+
py_html = pathlib.Path("gor.html")
|
380 |
+
pv_graph.save_graph(py_html.as_posix())
|
381 |
+
|
382 |
+
st.components.v1.html(
|
383 |
+
py_html.read_text(encoding = "utf-8"),
|
384 |
+
height = render.HTML_HEIGHT_WITH_CONTROLS,
|
385 |
+
scrolling = False,
|
386 |
+
)
|
387 |
+
|
388 |
+
duration = round(time.time() - start_time, 3)
|
389 |
+
st.write(f"transform: {round(duration, 3)} sec, {len(gor.rel_list)} relations")
|
390 |
+
|
391 |
+
## download lemma graph
|
392 |
+
st.subheader("download the results", divider = "rainbow")
|
393 |
+
st.markdown(
|
394 |
+
"""
|
395 |
+
Download a serialized <em>lemma graph</em> in multiple formats:
|
396 |
+
""",
|
397 |
+
unsafe_allow_html = True,
|
398 |
+
)
|
399 |
+
|
400 |
+
col1, col2, col3 = st.columns(3)
|
401 |
+
|
402 |
+
with col1:
|
403 |
+
st.download_button(
|
404 |
+
label = "download node-link",
|
405 |
+
data = tg.dump_lemma_graph(),
|
406 |
+
file_name = "lemma_graph.json",
|
407 |
+
mime = "application/json",
|
408 |
+
)
|
409 |
+
|
410 |
+
st.markdown(
|
411 |
+
"""
|
412 |
+
<a href="https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.json_graph.node_link_data.html" target="_blank"><em>node-link</em></a>: JSON data suitable for import to <a href="https://neo4j.com/docs/getting-started/data-import/csv-import/" target="_blank"><em>Neo4j</em></a>, <a href="https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.json_graph.node_link_graph.html#networkx.readwrite.json_graph.node_link_graph" target="_blank"><em>NetworkX</em></a>, etc.
|
413 |
+
""",
|
414 |
+
unsafe_allow_html = True,
|
415 |
+
)
|
416 |
+
|
417 |
+
with col2:
|
418 |
+
st.download_button(
|
419 |
+
label = "download RDF",
|
420 |
+
data = tg.export_rdf(),
|
421 |
+
file_name = "lemma_graph.ttl",
|
422 |
+
mime = "text/turtle",
|
423 |
+
)
|
424 |
+
|
425 |
+
st.markdown(
|
426 |
+
"""
|
427 |
+
<a href="https://www.w3.org/TR/turtle/" target="_blank"><em>Turtle/N3</em></a>: W3C semantic graph representation, based on RDF, OWL, SKOS, etc.
|
428 |
+
""",
|
429 |
+
unsafe_allow_html = True,
|
430 |
+
)
|
431 |
+
|
432 |
+
with col3:
|
433 |
+
st.download_button(
|
434 |
+
label = "download KùzuDB",
|
435 |
+
data = tg.export_kuzu(zip_name = "lemma_graph.zip"),
|
436 |
+
file_name = "lemma.zip",
|
437 |
+
mime = "application/x-zip-compressed",
|
438 |
+
)
|
439 |
+
|
440 |
+
st.markdown(
|
441 |
+
"""
|
442 |
+
<a href="https://opencypher.org/" target="_blank"><em>openCypher</em></a>: ZIP file of a labeled property graph in <a href="https://kuzudb.com/" target="_blank"><em>KùzuDB</em></a>
|
443 |
+
""",
|
444 |
+
unsafe_allow_html = True,
|
445 |
+
)
|
446 |
+
|
447 |
+
|
448 |
+
## WIP
|
449 |
+
st.divider()
|
450 |
+
st.write("(WIP)")
|
451 |
+
|
452 |
+
thanks: str = """
|
453 |
+
This demo has completed, and thank you for running a Derwen space!
|
454 |
+
"""
|
455 |
+
|
456 |
+
st.toast(
|
457 |
+
thanks,
|
458 |
+
icon ="😍",
|
459 |
+
)
|
bin/nb_md.sh
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash -e -x
|
2 |
+
|
3 |
+
for notebook_path in examples/*.ipynb; do
|
4 |
+
[ -e "$notebook_path" ] || continue
|
5 |
+
|
6 |
+
notebook=`basename $notebook_path`
|
7 |
+
stem=`basename $notebook_path .ipynb`
|
8 |
+
|
9 |
+
cp $notebook_path docs/$notebook
|
10 |
+
jupyter nbconvert docs/$notebook --to markdown
|
11 |
+
#exit 0
|
12 |
+
|
13 |
+
python3 bin/vis_doc.py docs/"$stem".md
|
14 |
+
rm docs/$notebook
|
15 |
+
done
|
bin/preview.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
"""
|
5 |
+
Preview the `MkDocs` build of the online documentation.
|
6 |
+
"""
|
7 |
+
|
8 |
+
from pathlib import PurePosixPath
|
9 |
+
import os
|
10 |
+
|
11 |
+
from flask import Flask, redirect, send_from_directory, url_for # pylint: disable=E0401
|
12 |
+
|
13 |
+
DOCS_ROUTE = "/docs/"
|
14 |
+
DOCS_FILES = "../site"
|
15 |
+
DOCS_PORT = 8000
|
16 |
+
|
17 |
+
APP = Flask(__name__, static_folder=DOCS_FILES, template_folder=DOCS_FILES)
|
18 |
+
|
19 |
+
APP.config["DEBUG"] = False
|
20 |
+
APP.config["MAX_CONTENT_LENGTH"] = 52428800
|
21 |
+
APP.config["SECRET_KEY"] = "Technically, I remain uncommitted."
|
22 |
+
APP.config["SEND_FILE_MAX_AGE_DEFAULT"] = 3000
|
23 |
+
|
24 |
+
|
25 |
+
@APP.route(DOCS_ROUTE, methods=["GET"])
|
26 |
+
@APP.route(DOCS_ROUTE + "<path:path>", methods=["GET"], defaults={"path": None})
|
27 |
+
@APP.route(DOCS_ROUTE + "<path:path>", methods=["GET"])
|
28 |
+
def static_proxy (path=""):
|
29 |
+
"""static route for an asset"""
|
30 |
+
if not path:
|
31 |
+
suffix = ""
|
32 |
+
else:
|
33 |
+
suffix = PurePosixPath(path).suffix
|
34 |
+
|
35 |
+
if suffix not in [".css", ".js", ".map", ".png", ".svg", ".xml"]:
|
36 |
+
path = os.path.join(path, "index.html")
|
37 |
+
|
38 |
+
return send_from_directory(DOCS_FILES, path)
|
39 |
+
|
40 |
+
|
41 |
+
@APP.route("/index.html")
|
42 |
+
@APP.route("/home/")
|
43 |
+
@APP.route("/")
|
44 |
+
def home_redirects ():
|
45 |
+
"""redirect for home page"""
|
46 |
+
return redirect(url_for("static_proxy"))
|
47 |
+
|
48 |
+
|
49 |
+
if __name__ == "__main__":
|
50 |
+
APP.run(host="0.0.0.0", port=DOCS_PORT, debug=True)
|
bin/push_pypi.sh
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash -e -x
|
2 |
+
|
3 |
+
rm -rf dist build textgraphs.egg-info
|
4 |
+
python3 -m build
|
5 |
+
twine check dist/*
|
6 |
+
|
7 |
+
# this assumes the use of `~/.pypirc`
|
8 |
+
# https://packaging.python.org/en/latest/specifications/pypirc/
|
9 |
+
|
10 |
+
twine upload ./dist/* --verbose
|
bin/vis_doc.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
"""
|
5 |
+
Convert the markdown generated from Jupyter notebooks to preserve
|
6 |
+
rendered images, etc.
|
7 |
+
"""
|
8 |
+
|
9 |
+
import os
|
10 |
+
import pathlib
|
11 |
+
import re
|
12 |
+
import sys
|
13 |
+
import time
|
14 |
+
import traceback
|
15 |
+
import typing
|
16 |
+
|
17 |
+
from icecream import ic # pylint: disable=E0401
|
18 |
+
from selenium import webdriver # pylint: disable=E0401
|
19 |
+
|
20 |
+
|
21 |
+
class Converter:
|
22 |
+
"""
|
23 |
+
HTML/Markdown conversion
|
24 |
+
"""
|
25 |
+
PAT_HEADER = re.compile(r"^(```python\n\# for use.*production:\n.*\n```\n)", re.MULTILINE)
|
26 |
+
PAT_SOURCE = re.compile(r"\s+src\=\"(\S+)\"")
|
27 |
+
REPLACEMENT_HEADER: str = """
|
28 |
+
!!! note
|
29 |
+
To run this notebook in JupyterLab, load [`examples/{}.ipynb`]({}/examples/{}.ipynb)
|
30 |
+
|
31 |
+
"""
|
32 |
+
|
33 |
+
def __init__ (
|
34 |
+
self,
|
35 |
+
src_url: str,
|
36 |
+
) -> None:
|
37 |
+
"""
|
38 |
+
Constructor.
|
39 |
+
"""
|
40 |
+
self.src_url: str = src_url
|
41 |
+
|
42 |
+
|
43 |
+
def replace_sys_header (
|
44 |
+
self,
|
45 |
+
text: str,
|
46 |
+
stem: str,
|
47 |
+
*,
|
48 |
+
debug: bool = False,
|
49 |
+
) -> str:
|
50 |
+
"""
|
51 |
+
Replace the initial cell in a tutorial notebook.
|
52 |
+
"""
|
53 |
+
output: typing.List[ str ] = []
|
54 |
+
|
55 |
+
for chunk in self.PAT_HEADER.split(text):
|
56 |
+
m_header: typing.Optional[ re.Match ] = self.PAT_HEADER.match(chunk)
|
57 |
+
|
58 |
+
if debug:
|
59 |
+
ic(m_header)
|
60 |
+
|
61 |
+
if m_header:
|
62 |
+
header: str = self.REPLACEMENT_HEADER.format(stem, self.src_url, stem)
|
63 |
+
output.append(header)
|
64 |
+
else:
|
65 |
+
output.append(chunk)
|
66 |
+
|
67 |
+
return "\n".join(output)
|
68 |
+
|
69 |
+
|
70 |
+
def get_pyvis_html (
|
71 |
+
self,
|
72 |
+
iframe: str,
|
73 |
+
*,
|
74 |
+
debug: bool = False,
|
75 |
+
) -> str:
|
76 |
+
"""
|
77 |
+
Locate the HTML files generated by `PyVis` if any.
|
78 |
+
This assumes the HTML files are named `tmp.fig*.*`
|
79 |
+
"""
|
80 |
+
source_html: typing.Optional[ str ] = None
|
81 |
+
m_source: typing.Optional[ re.Match ] = self.PAT_SOURCE.search(iframe)
|
82 |
+
|
83 |
+
if m_source:
|
84 |
+
source_html = m_source.group(1)
|
85 |
+
|
86 |
+
if debug:
|
87 |
+
ic(source_html)
|
88 |
+
|
89 |
+
if "tmp.fig" not in source_html: # type: ignore
|
90 |
+
# <iframe/> wasn't generated by PyVis
|
91 |
+
source_html = None
|
92 |
+
|
93 |
+
return source_html # type: ignore
|
94 |
+
|
95 |
+
|
96 |
+
def render_screenshot (
|
97 |
+
self,
|
98 |
+
source_html: str,
|
99 |
+
source_png,
|
100 |
+
) -> None:
|
101 |
+
"""
|
102 |
+
use Selenium to render `source_png` from `source_html`
|
103 |
+
"""
|
104 |
+
#chrome_path = os.getcwd() + "/chromedriver"
|
105 |
+
#chrome_options = Options()
|
106 |
+
|
107 |
+
browser: webdriver.Chrome = webdriver.Chrome()
|
108 |
+
browser.get(source_html)
|
109 |
+
time.sleep(10)
|
110 |
+
|
111 |
+
browser.get_screenshot_as_file(source_png)
|
112 |
+
browser.quit()
|
113 |
+
|
114 |
+
|
115 |
+
def replace_pyvis_iframe (
|
116 |
+
self,
|
117 |
+
text: str,
|
118 |
+
parent: pathlib.Path,
|
119 |
+
stem: str,
|
120 |
+
*,
|
121 |
+
debug: bool = False,
|
122 |
+
) -> str:
|
123 |
+
"""
|
124 |
+
Substitute static images for the rendered graphs.
|
125 |
+
"""
|
126 |
+
output: typing.List[ str ] = []
|
127 |
+
in_iframe: bool = False
|
128 |
+
|
129 |
+
for line in text.split("\n"):
|
130 |
+
if line.startswith("<iframe"):
|
131 |
+
in_iframe = True
|
132 |
+
|
133 |
+
if not in_iframe:
|
134 |
+
output.append(line)
|
135 |
+
elif line.strip().startswith("src="):
|
136 |
+
src_html: str = self.get_pyvis_html(line)
|
137 |
+
src_png: str = src_html.replace(".html", ".png")
|
138 |
+
|
139 |
+
if debug:
|
140 |
+
ic(src_png)
|
141 |
+
|
142 |
+
try:
|
143 |
+
os.mkdir(f"{parent}/{stem}_files")
|
144 |
+
except: # pylint: disable=W0702
|
145 |
+
pass
|
146 |
+
|
147 |
+
self.render_screenshot(
|
148 |
+
f"file://{os.getcwd()}/examples/{src_html}",
|
149 |
+
f"{parent}/{stem}_files/{src_png}",
|
150 |
+
)
|
151 |
+
|
152 |
+
output.append(f"")
|
153 |
+
|
154 |
+
if line.startswith("></iframe>"):
|
155 |
+
in_iframe = False
|
156 |
+
|
157 |
+
return "\n".join(output)
|
158 |
+
|
159 |
+
|
160 |
+
if __name__ == "__main__":
|
161 |
+
try:
|
162 |
+
conv: Converter = Converter(
|
163 |
+
"https://github.com/DerwenAI/textgraphs/blob/main",
|
164 |
+
)
|
165 |
+
|
166 |
+
filename: pathlib.Path = pathlib.Path(sys.argv[1])
|
167 |
+
_parent: pathlib.Path = filename.parent
|
168 |
+
_stem: str = filename.stem
|
169 |
+
|
170 |
+
ic(filename, _parent, _stem)
|
171 |
+
|
172 |
+
with open(filename, "r", encoding = "utf-8") as fp:
|
173 |
+
html: str = fp.read()
|
174 |
+
|
175 |
+
html = conv.replace_sys_header( # pylint: disable=C0103
|
176 |
+
html,
|
177 |
+
_stem,
|
178 |
+
debug = False, # True
|
179 |
+
)
|
180 |
+
|
181 |
+
#print(text)
|
182 |
+
#sys.exit(0)
|
183 |
+
|
184 |
+
html = conv.replace_pyvis_iframe( # pylint: disable=C0103
|
185 |
+
html,
|
186 |
+
_parent,
|
187 |
+
_stem,
|
188 |
+
debug = True, # False
|
189 |
+
)
|
190 |
+
|
191 |
+
with open(filename, "w", encoding = "utf-8") as fp:
|
192 |
+
fp.write(html)
|
193 |
+
|
194 |
+
except Exception as ex: # pylint: disable=W0718
|
195 |
+
ic(ex)
|
196 |
+
traceback.print_exc()
|
demo.py
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
"""
|
5 |
+
Sample application to demo the `TextGraphs` library.
|
6 |
+
|
7 |
+
see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md
|
8 |
+
"""
|
9 |
+
|
10 |
+
import asyncio
|
11 |
+
import sys # pylint: disable=W0611
|
12 |
+
import traceback
|
13 |
+
import time
|
14 |
+
import typing
|
15 |
+
|
16 |
+
from icecream import ic # pylint: disable=E0401
|
17 |
+
from pyinstrument import Profiler # pylint: disable=E0401
|
18 |
+
import matplotlib.pyplot as plt # pylint: disable=E0401
|
19 |
+
import pandas as pd # pylint: disable=E0401
|
20 |
+
|
21 |
+
import textgraphs
|
22 |
+
|
23 |
+
|
24 |
+
if __name__ == "__main__":
|
25 |
+
SRC_TEXT: str = """
|
26 |
+
Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
|
27 |
+
After the war, Werner fled to America to become famous.
|
28 |
+
"""
|
29 |
+
|
30 |
+
## set up
|
31 |
+
## NB: profiler raises handler exceptions when `concur = False`
|
32 |
+
debug: bool = False # True
|
33 |
+
concur: bool = True # False
|
34 |
+
profile: bool = True # False
|
35 |
+
|
36 |
+
if profile:
|
37 |
+
profiler: Profiler = Profiler()
|
38 |
+
profiler.start()
|
39 |
+
|
40 |
+
try:
|
41 |
+
start_time: float = time.time()
|
42 |
+
|
43 |
+
tg: textgraphs.TextGraphs = textgraphs.TextGraphs(
|
44 |
+
factory = textgraphs.PipelineFactory(
|
45 |
+
spacy_model = textgraphs.SPACY_MODEL,
|
46 |
+
ner = None, #textgraphs.NERSpanMarker(),
|
47 |
+
kg = textgraphs.KGWikiMedia(
|
48 |
+
spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API,
|
49 |
+
dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API,
|
50 |
+
dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API,
|
51 |
+
wikidata_api = textgraphs.WIKIDATA_API,
|
52 |
+
),
|
53 |
+
infer_rels = [
|
54 |
+
textgraphs.InferRel_OpenNRE(
|
55 |
+
model = textgraphs.OPENNRE_MODEL,
|
56 |
+
max_skip = textgraphs.MAX_SKIP,
|
57 |
+
min_prob = textgraphs.OPENNRE_MIN_PROB,
|
58 |
+
),
|
59 |
+
textgraphs.InferRel_Rebel(
|
60 |
+
lang = "en_XX",
|
61 |
+
mrebel_model = textgraphs.MREBEL_MODEL,
|
62 |
+
),
|
63 |
+
],
|
64 |
+
),
|
65 |
+
)
|
66 |
+
|
67 |
+
duration: float = round(time.time() - start_time, 3)
|
68 |
+
print(f"{duration:7.3f} sec: set up")
|
69 |
+
|
70 |
+
|
71 |
+
## NLP parse
|
72 |
+
start_time = time.time()
|
73 |
+
|
74 |
+
pipe: textgraphs.Pipeline = tg.create_pipeline(
|
75 |
+
SRC_TEXT.strip(),
|
76 |
+
)
|
77 |
+
|
78 |
+
duration = round(time.time() - start_time, 3)
|
79 |
+
print(f"{duration:7.3f} sec: parse text")
|
80 |
+
|
81 |
+
|
82 |
+
## collect graph elements from the parse
|
83 |
+
start_time = time.time()
|
84 |
+
|
85 |
+
tg.collect_graph_elements(
|
86 |
+
pipe,
|
87 |
+
debug = debug,
|
88 |
+
)
|
89 |
+
|
90 |
+
duration = round(time.time() - start_time, 3)
|
91 |
+
print(f"{duration:7.3f} sec: collect elements")
|
92 |
+
|
93 |
+
|
94 |
+
## perform entity linking
|
95 |
+
start_time = time.time()
|
96 |
+
|
97 |
+
tg.perform_entity_linking(
|
98 |
+
pipe,
|
99 |
+
debug = debug,
|
100 |
+
)
|
101 |
+
|
102 |
+
duration = round(time.time() - start_time, 3)
|
103 |
+
print(f"{duration:7.3f} sec: entity linking")
|
104 |
+
|
105 |
+
|
106 |
+
## perform concurrent relation extraction
|
107 |
+
start_time = time.time()
|
108 |
+
|
109 |
+
if concur:
|
110 |
+
try:
|
111 |
+
loop = asyncio.get_running_loop()
|
112 |
+
except RuntimeError:
|
113 |
+
loop = asyncio.new_event_loop()
|
114 |
+
asyncio.set_event_loop(loop)
|
115 |
+
|
116 |
+
inferred_edges: list = loop.run_until_complete(
|
117 |
+
tg.infer_relations_async(
|
118 |
+
pipe,
|
119 |
+
debug = debug,
|
120 |
+
)
|
121 |
+
)
|
122 |
+
else:
|
123 |
+
inferred_edges = tg.infer_relations(
|
124 |
+
pipe,
|
125 |
+
debug = debug,
|
126 |
+
)
|
127 |
+
|
128 |
+
duration = round(time.time() - start_time, 3)
|
129 |
+
print(f"{duration:7.3f} sec: relation extraction")
|
130 |
+
|
131 |
+
n_list: list = list(tg.nodes.values())
|
132 |
+
|
133 |
+
df_rel: pd.DataFrame = pd.DataFrame.from_dict([
|
134 |
+
{
|
135 |
+
"src": n_list[edge.src_node].text,
|
136 |
+
"dst": n_list[edge.dst_node].text,
|
137 |
+
"rel": pipe.kg.normalize_prefix(edge.rel),
|
138 |
+
"weight": edge.prob,
|
139 |
+
}
|
140 |
+
for edge in inferred_edges
|
141 |
+
])
|
142 |
+
|
143 |
+
ic(df_rel)
|
144 |
+
|
145 |
+
|
146 |
+
## construct the _lemma graph_
|
147 |
+
start_time = time.time()
|
148 |
+
|
149 |
+
tg.construct_lemma_graph(
|
150 |
+
debug = debug,
|
151 |
+
)
|
152 |
+
|
153 |
+
duration = round(time.time() - start_time, 3)
|
154 |
+
print(f"{duration:7.3f} sec: construct graph")
|
155 |
+
|
156 |
+
|
157 |
+
## rank the extracted phrases
|
158 |
+
start_time = time.time()
|
159 |
+
|
160 |
+
tg.calc_phrase_ranks(
|
161 |
+
pr_alpha = textgraphs.PAGERANK_ALPHA,
|
162 |
+
debug = debug,
|
163 |
+
)
|
164 |
+
|
165 |
+
duration = round(time.time() - start_time, 3)
|
166 |
+
print(f"{duration:7.3f} sec: rank phrases")
|
167 |
+
|
168 |
+
|
169 |
+
## show the extracted phrase results
|
170 |
+
ic(tg.get_phrases_as_df())
|
171 |
+
|
172 |
+
if debug: # pylint: disable=W0101
|
173 |
+
for key, node in tg.nodes.items():
|
174 |
+
print(key, node)
|
175 |
+
|
176 |
+
for key, edge in tg.edges.items():
|
177 |
+
print(key, edge)
|
178 |
+
|
179 |
+
except Exception as ex: # pylint: disable=W0718
|
180 |
+
ic(ex)
|
181 |
+
traceback.print_exc()
|
182 |
+
|
183 |
+
|
184 |
+
## transform graph data to a _graph of relations_
|
185 |
+
start_time = time.time()
|
186 |
+
|
187 |
+
gor: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(
|
188 |
+
tg,
|
189 |
+
)
|
190 |
+
|
191 |
+
gor.seeds(
|
192 |
+
debug = False, # True
|
193 |
+
)
|
194 |
+
|
195 |
+
gor.construct_gor(
|
196 |
+
debug = False, # True
|
197 |
+
)
|
198 |
+
|
199 |
+
_scores: typing.Dict[ tuple, float ] = gor.get_affinity_scores(
|
200 |
+
debug = False, # True
|
201 |
+
)
|
202 |
+
|
203 |
+
duration = round(time.time() - start_time, 3)
|
204 |
+
print(f"{duration:7.3f} sec: graph of relations")
|
205 |
+
|
206 |
+
gor.render_gor_plt(_scores)
|
207 |
+
plt.show()
|
208 |
+
|
209 |
+
#sys.exit(0)
|
210 |
+
|
211 |
+
|
212 |
+
######################################################################
|
213 |
+
## stack profiler report
|
214 |
+
if profile:
|
215 |
+
profiler.stop()
|
216 |
+
profiler.print()
|
217 |
+
|
218 |
+
## output lemma graph as JSON
|
219 |
+
with open("lemma.json", "w", encoding = "utf-8") as fp:
|
220 |
+
fp.write(tg.dump_lemma_graph())
|
docs/abstract.md
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Introduction
|
2 |
+
|
3 |
+
**DRAFT** (WIP)
|
4 |
+
|
5 |
+
The primary goal of this project is to improve semi-automated KG construction from large collections of unstructured text sources, while leveraging feedback from domain experts and maintaining quality checks for the aggregated results.
|
6 |
+
|
7 |
+
Typical downstream use cases for these KGs include collecting data for industrial optimization use cases based on _operations research_, as mechanisms enabling structured LLM reasoning [#besta2024topo](biblio.md#besta2024topo), and potentially new methods of integrating KG linked data directly into LLM inference [#wen2023mindmap](biblio.md#wen2023mindmap)
|
8 |
+
|
9 |
+
To this point, this project explores hybrid applications which leverage LLMs to improve _natural language processing_ (NLP) pipeline components, complementing them with other deep learning models, graph queries, semantic inference, and related APIs.
|
10 |
+
|
11 |
+
Notably, LLMs come from NLP research.
|
12 |
+
Amidst an overwhelming avalanche of contemporary news headlines, pre-print papers, celebrity researchers, industry pundits, and so on ...
|
13 |
+
the hype raises a simple question: how good are LLMs at improving the results of natural language parsing and annotation in practice?
|
14 |
+
|
15 |
+
Granted, it is possible to use LLM chat interfaces to generate entire KGs from unstructured text sources.
|
16 |
+
Results from this brute-force approach tend to be mixed, especially when KGs rely on non-trivial controlled vocabularies and overlapping concepts.
|
17 |
+
For examples, see [#lawrence2024ttg](biblio.md#lawrence2024ttg) and [#nizami2023llm](biblio.md#nizami2023llm).
|
18 |
+
|
19 |
+
Issues with LLM accuracy (e.g., hallucinations) may be partially addressed through use of _retrieval augmented generation_ (RAG).
|
20 |
+
Even so, this approach tends to be expensive, especially when large numbers of PDF documents need to be used as input.
|
21 |
+
Use of a fully-automated "black box" based on an LLM chat agent in production use cases also tends to contradict the benefits of curating a KG to collect representations of an organization's domain expertise.
|
22 |
+
|
23 |
+
There are perhaps some deeper issues implied in this work.
|
24 |
+
To leverage "generative AI" for KGs, we must cross multiple boundaries of representation.
|
25 |
+
For example, graph ML approaches which start from graph-theoretic descriptions lose vital information.
|
26 |
+
On the one hand, these are generally focused on _node prediction_ or _edge prediction_ tasks, which seems overly reductionist and simplistic in the context of trying to generate streams of _composable elements_ for building graphs.
|
27 |
+
On the other hand, these approaches typically get trained on _node embeddings_, _edge embeddings_, or _graph embeddings_ -- which may not quite fit the problem at hand.
|
28 |
+
Rolling back even further, the transition from NLP parsing of unstructured text sources to the construction of KGs also tends to throw away a lot of potentially useful annotations and context available from the NLP workflows.
|
29 |
+
Commonly accepted means for training LLMs from text sources directly often use tokenization which is relatively naïve about what might be structured within the data, other than linear sequences of characters.
|
30 |
+
Notably, this ignores the relationships among surface forms of text and their co-occurrence with predicted entities or relations.
|
31 |
+
Some contemporary approaches to RAG use "chunked" text, attempting to link between chunks, even though this approach arguably destroys information about what is structured within that input data.
|
32 |
+
Multiple disconnects thus separate the source data, the representation methods used in training models, and the tactics employed for applications; moreover, the "applications" targeted in research projects generally stop at comparisons of benchmarks.
|
33 |
+
Overall, these disconnects indicate the need for rethinking the problem at multiple points.
|
34 |
+
|
35 |
+
For industry uses of KGs, one frequent observation from those leading production projects is that the "last mile" of applications generally relies on _operations research_, not ML.
|
36 |
+
We must keep these needs in mind when applying "generative AI" approaches to industry use cases.
|
37 |
+
Are we developing representations which can subsequently be leveraged for dynamic programming, convex optimization, etc.?
|
38 |
+
|
39 |
+
This project explores a different definition for "generative AI" in the context of working with KGs for production use cases.
|
40 |
+
Rather than pursue an LLM to perform all required tasks, is it possible to combine the use of smaller, more specialized models for specific tasks within the reasonably well-understood process of KG construction?
|
41 |
+
In broad strokes, can this alternative approach provide counterfactuals to the contemporary trends in chat-based _prompt engineering_?
|
42 |
+
|
43 |
+
Seeking to integrate results from several other research projects implies substantial amounts of code reuse.
|
44 |
+
It would be intractable in terms of time and funding to rewrite code and then re-evaluate models for the many research projects which are within the scope of this work.
|
45 |
+
Therefore reproducibility of published results -- based on open source code, models, evals, etc. -- becomes a crucial factor for determining whether other projects are suitable to be adapted into KG workflows.
|
46 |
+
|
47 |
+
For the sake of brevity, we do not define all of the terminology used, instead relying on broadly used terms in the literature.
|
docs/ack.md
ADDED
@@ -0,0 +1,11 @@
1 |
+
# Acknowledgements
|
2 |
+
|
3 |
+
<img src="../assets/nouns/community.png" alt="Community by Aneeque Ahmed from the Noun Project" />
|
4 |
+
|
5 |
+
Contributors:
|
6 |
+
|
7 |
+
- Jürgen Müller, Zahid Abul-Basher, Nihatha Lathiff, et al., @ BASF
|
8 |
+
- open source sponsors for Derwen.ai
|
9 |
+
- perspectives from the KùzuDB.com team
|
10 |
+
- perspectives from the Argilla.io team
|
11 |
+
- feedback and suggestions from participants at [Dagstuhl Seminar 24061](https://www.dagstuhl.de/24061)
|
docs/assets/favicon.png
ADDED
docs/assets/hitl.png
ADDED
docs/assets/logo.png
ADDED
docs/assets/nouns/api.png
ADDED
docs/assets/nouns/biblio.png
ADDED
docs/assets/nouns/community.png
ADDED
docs/assets/nouns/concepts.png
ADDED
docs/assets/nouns/discovery.png
ADDED
docs/assets/nouns/evidence.png
ADDED
docs/assets/nouns/feedback.png
ADDED
docs/assets/nouns/howto.png
ADDED
docs/assets/nouns/tutorial.png
ADDED
docs/assets/textgraphs.graffle
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2177f30434db8dc6534ed39b3f5a9bed3b0fbd00db26afd841f6e77c788910f2
|
3 |
+
size 1410392
|
docs/biblio.md
ADDED
@@ -0,0 +1,232 @@
1 |
+
# Bibliography
|
2 |
+
|
3 |
+
<img src="../assets/nouns/biblio.png" alt="books by b a r z i n from the Noun Project" />
|
4 |
+
|
5 |
+
Where possible, the bibliography entries use conventions at
|
6 |
+
<https://www.bibsonomy.org/>
|
7 |
+
for [*citation keys*](https://bibdesk.sourceforge.io/manual/BibDeskHelp_2.html).
|
8 |
+
Journal abbreviations come from
|
9 |
+
<https://academic-accelerator.com/Journal-Abbreviation/System>
|
10 |
+
based on [*ISO 4*](https://en.wikipedia.org/wiki/ISO_4) standards.
|
11 |
+
Links to online versions of cited works use
|
12 |
+
[DOI](https://www.doi.org/)
|
13 |
+
for [*persistent identifiers*](https://www.crossref.org/education/metadata/persistent-identifiers/).
|
14 |
+
When available,
|
15 |
+
[*open access*](https://peerj.com/preprints/3119v1/)
|
16 |
+
URLs are listed.
|
17 |
+
|
18 |
+
|
19 |
+
## – A –
|
20 |
+
|
21 |
+
### aarsen2023ner
|
22 |
+
|
23 |
+
["SpanMarker for Named Entity Recognition"](https://raw.githubusercontent.com/tomaarsen/SpanMarkerNER/main/thesis.pdf)
|
24 |
+
**Tom Aarsen**
|
25 |
+
*Radboud University* (2023-06-01)
|
26 |
+
> A span-level Named Entity Recognition (NER) model that aims to improve performance while reducing computational requirements. SpanMarker leverages special marker tokens and utilizes BERT-style encoders with position IDs and attention mask matrices to capture contextual information effectively.
|
27 |
+
|
28 |
+
### auer07dbpedia
|
29 |
+
|
30 |
+
["DBpedia: A Nucleus for a Web of Open Data"](https://doi.org/10.1007/978-3-540-76298-0_52)
|
31 |
+
**Sören Auer**, **Christian Bizer**, **Georgi Kobilarov**, **Jens Lehmann**, **Richard Cyganiak**, **Zachary Ives**
|
32 |
+
*ISWC* (2007-11-11)
|
33 |
+
> DBpedia is a community effort to extract structured information from Wikipedia and to make this information available on the Web. DBpedia allows you to ask sophisticated queries against datasets derived from Wikipedia and to link other datasets on the Web to Wikipedia data.
|
34 |
+
|
35 |
+
## – B –
|
36 |
+
|
37 |
+
### bachbhg17
|
38 |
+
|
39 |
+
["Hinge-Loss Markov Random Fields and Probabilistic Soft Logic"](https://arxiv.org/abs/1505.04406)
|
40 |
+
**Stephen Bach**, **Matthias Broecheler**, **Bert Huang**, **Lise Getoor**
|
41 |
+
*JMLR* (2017-11-17)
|
42 |
+
> We introduce two new formalisms for modeling structured data, and show that they can both capture rich structure and scale to big data. The first, hinge-loss Markov random fields (HL-MRFs), is a new kind of probabilistic graphical model that generalizes different approaches to convex inference.
|
43 |
+
|
44 |
+
### barrière2016elsf
|
45 |
+
|
46 |
+
["Entities, Labels, and Surface Forms"](https://doi.org/10.1007/978-3-319-41337-2_2)
|
47 |
+
**Caroline Barrière**
|
48 |
+
_Springer_ (2016-11-19)
|
49 |
+
> We will look into a first obstacle toward this seemingly simple IE goal: the fact that entities do not have normalized names. Instead, entities can be referred to by many different surface forms.
|
50 |
+
|
51 |
+
### besta2024topo
|
52 |
+
|
53 |
+
["Topologies of Reasoning: Demystifying Chains, Trees, and Graphs of Thoughts"](https://arxiv.org/abs/2401.14295)
|
54 |
+
**Maciej Besta**, **Florim Memedi**, **Zhenyu Zhang**, **Robert Gerstenberger**, **Nils Blach**, **Piotr Nyczyk**, **Marcin Copik**, **Grzegorz Kwasniewski**, **Jurgen Müller**, **Lukas Gianinazzi**, **Ales Kubicek**, **Hubert Niewiadomski**, **Onur Mutlu**, **Torsten Hoefler**
|
55 |
+
_ETH Zurich_ (2024-01-25)
|
56 |
+
> Introducing a blueprint and an accompanying taxonomy of prompting schemes, focusing on the underlying structure of reasoning.
|
57 |
+
|
58 |
+
## – C –
|
59 |
+
|
60 |
+
### cabot2023redfm
|
61 |
+
|
62 |
+
["RED<sup>FM</sup>: a Filtered and Multilingual Relation Extraction Dataset"](https://arxiv.org/abs/2306.09802)
|
63 |
+
**Pere-Lluís Huguet Cabot**, **Simone Tedeschi**, **Axel-Cyrille Ngonga Ngomo**, **Roberto Navigli**
|
64 |
+
_ACL_ (2023-06-19)
|
65 |
+
> Relation Extraction (RE) is a task that identifies relationships between entities in a text, enabling the acquisition of relational facts and bridging the gap between natural language and structured knowledge. However, current RE models often rely on small datasets with low coverage of relation types, particularly when working with languages other than English. In this paper, we address the above issue and provide two new resources that enable the training and evaluation of multilingual RE systems.
|
66 |
+
|
67 |
+
## – E –
|
68 |
+
|
69 |
+
### erxlebengkmv14
|
70 |
+
|
71 |
+
["Introducing Wikidata to the Linked Data Web"](https://doi.org/10.1007/978-3-319-11964-9_4)
|
72 |
+
**Fredo Erxleben**, **Michael Günther**, **Markus Krötzsch**, **Julian Mendez**, **Denny Vrandečić**
|
73 |
+
_ISWC_ (2014-10-19)
|
74 |
+
> We introduce new RDF exports that connect Wikidata to the Linked Data Web. We explain the data model of Wikidata and discuss its encoding in RDF. Moreover, we introduce several partial exports that provide more selective or simplified views on the data.
|
75 |
+
|
76 |
+
## – F –
|
77 |
+
|
78 |
+
### feng2023kuzu
|
79 |
+
|
80 |
+
["KÙZU Graph Database Management System"](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf)
|
81 |
+
**Xiyang Feng**, **Guodong Jin**, **Ziyi Chen**, **Chang Liu**, **Semih Salihoğlu**
|
82 |
+
_CIDR_ (2023-01-08)
|
83 |
+
> We present Kùzu, a new GDBMS we are developing at University of Waterloo that aims to integrate state-of-art storage, indexing, and query processing techniques to highly optimize for this feature set.
|
84 |
+
|
85 |
+
## – G –
|
86 |
+
|
87 |
+
### galkin2023ultra
|
88 |
+
|
89 |
+
["Towards Foundation Models for Knowledge Graph Reasoning"](https://arxiv.org/abs/2310.04562)
|
90 |
+
**Mikhail Galkin**, **Xinyu Yuan**, **Hesham Mostafa**, **Jian Tang**, **Zhaocheng Zhu**
|
91 |
+
preprint (2023-10-06)
|
92 |
+
> ULTRA builds relational representations as a function conditioned on their interactions. Such a conditioning strategy allows a pre-trained ULTRA model to inductively generalize to any unseen KG with any relation vocabulary and to be fine-tuned on any graph.
|
93 |
+
|
94 |
+
## – H –
|
95 |
+
|
96 |
+
### hagberg2008
|
97 |
+
|
98 |
+
["Exploring network structure, dynamics, and function using NetworkX"](https://conference.scipy.org/proceedings/SciPy2008/paper_2/)
|
99 |
+
**Aric A. Hagberg**, **Daniel A. Schult**, **Pieter J. Swart**
|
100 |
+
_SciPy2008_ (2008-08-19)
|
101 |
+
> NetworkX is a Python language package for exploration and analysis of networks and network algorithms. The core package provides data structures for representing many types of networks, or graphs, including simple graphs, directed graphs, and graphs with parallel edges and self loops.
|
102 |
+
|
103 |
+
### hahnr88
|
104 |
+
|
105 |
+
["Automatic generation of hypertext knowledge bases"](https://doi.org/10.1145/966861.45429)
|
106 |
+
**Udo Hahn**, **Ulrich Reimer**
|
107 |
+
_ACM SIGOIS_ 9:2 (1988-04-01)
|
108 |
+
> The condensation process transforms the text representation structures resulting from the text parse into a more abstract thematic description of what the text is about, filtering out irrelevant knowledge structures and preserving only the most salient concepts.
|
109 |
+
|
110 |
+
### hamilton2020grl
|
111 |
+
|
112 |
+
[_Graph Representation Learning_](https://www.cs.mcgill.ca/~wlh/grl_book/)
|
113 |
+
**William Hamilton**
|
114 |
+
Morgan and Claypool (pre-print 2020)
|
115 |
+
> A brief but comprehensive introduction to graph representation learning, including methods for embedding graph data, graph neural networks, and deep generative models of graphs.
|
116 |
+
|
117 |
+
### hangyyls19
|
118 |
+
|
119 |
+
["OpenNRE: An Open and Extensible Toolkit for Neural Relation Extraction"](https://doi.org/10.18653/v1/D19-3029)
|
120 |
+
**Xu Han**, **Tianyu Gao**, **Yuan Yao**, **Deming Ye**, **Zhiyuan Liu**, **Maosong Sun**
|
121 |
+
*EMNLP* (2019-11-03)
|
122 |
+
> OpenNRE is an open-source and extensible toolkit that provides a unified framework to implement neural models for relation extraction (RE).
|
123 |
+
|
124 |
+
### hartig14
|
125 |
+
|
126 |
+
["Reconciliation of RDF* and Property Graphs"](https://arxiv.org/abs/1409.3288)
|
127 |
+
**Olaf Hartig**
|
128 |
+
_CoRR_ (2014-11-14)
|
129 |
+
> The document proposes a formalization of the PG model and introduces well-defined transformations between PGs and RDF.
|
130 |
+
|
131 |
+
### honnibal2020spacy
|
132 |
+
|
133 |
+
["spaCy: Industrial-strength Natural Language Processing in Python"](https://doi.org/10.5281/zenodo.1212303)
|
134 |
+
**Matthew Honnibal**, **Ines Montani**, **Sofie Van Landeghem**, **Adriane Boyd**
|
135 |
+
*Explosion AI* (2016-10-18)
|
136 |
+
> spaCy is a library for advanced Natural Language Processing in Python and Cython. It's built on the very latest research, and was designed from day one to be used in real products.
|
137 |
+
|
138 |
+
## – L –
|
139 |
+
|
140 |
+
### lee2023ingram
|
141 |
+
|
142 |
+
["InGram: Inductive Knowledge Graph Embedding via Relation Graphs"](https://arxiv.org/abs/2305.19987)
|
143 |
+
**Jaejun Lee**, **Chanyoung Chung**, **Joyce Jiyoung Whang**
|
144 |
+
_ICML_ (2023-08-17)
|
145 |
+
> In this paper, we propose an INductive knowledge GRAph eMbedding method, InGram, that can generate embeddings of new relations as well as new entities at inference time.
|
146 |
+
|
147 |
+
### loganlpgs19
|
148 |
+
|
149 |
+
["Barack's Wife Hillary: Using Knowledge-Graphs for Fact-Aware Language Modeling"](https://arxiv.org/abs/1906.07241)
|
150 |
+
**Robert L. Logan IV**, **Nelson F. Liu**, **Matthew E. Peters**, **Matt Gardner**, **Sameer Singh**
|
151 |
+
_ACL_ (2019-06-20)
|
152 |
+
> We introduce the knowledge graph language model (KGLM), a neural language model with mechanisms for selecting and copying facts from a knowledge graph that are relevant to the context.
|
153 |
+
|
154 |
+
## – M –
|
155 |
+
|
156 |
+
### martonsv17
|
157 |
+
|
158 |
+
["Formalising openCypher Graph Queries in Relational Algebra"](https://doi.org/10.1007/978-3-319-66917-5_13)
|
159 |
+
**József Marton**, **Gábor Szárnyas**, **Dániel Varró**
|
160 |
+
_ADBIS_ (2017-08-25)
|
161 |
+
> We present a formal specification for openCypher, a high-level declarative graph query language with an ongoing standardisation effort.
|
162 |
+
|
163 |
+
### mihalcea04textrank
|
164 |
+
|
165 |
+
["TextRank: Bringing Order into Text"](https://www.aclweb.org/anthology/W04-3252/)
|
166 |
+
**Rada Mihalcea**, **Paul Tarau**
|
167 |
+
*EMNLP* pp. 404-411 (2004-07-25)
|
168 |
+
> In this paper, the authors introduce TextRank, a graph-based ranking model for text processing, and show how this model can be successfully used in natural language applications.
|
169 |
+
|
170 |
+
## – N –
|
171 |
+
|
172 |
+
### nathan2016ptr
|
173 |
+
|
174 |
+
["PyTextRank, a Python implementation of TextRank for phrase extraction and summarization of text documents"](https://doi.org/10.5281/zenodo.4637885)
|
175 |
+
**Paco Nathan**, et al.
|
176 |
+
*Derwen* (2016-10-03)
|
177 |
+
> Python implementation of TextRank algorithms ("textgraphs") for phrase extraction
|
178 |
+
|
179 |
+
### nathan2023glod
|
180 |
+
|
181 |
+
["Graph Levels of Detail"](https://blog.derwen.ai/graph-levels-of-detail-ea4226abba55)
|
182 |
+
**Paco Nathan**
|
183 |
+
*Derwen* (2023-11-12)
|
184 |
+
> How can we work with graph data in more abstracted, aggregate perspectives? While we can run queries on graph data to compute aggregate measures, we don’t have programmatic means of “zooming out” to consider a large graph the way that one zooms out when using an online map.
|
185 |
+
|
186 |
+
## – Q –
|
187 |
+
|
188 |
+
### qin2023sgr
|
189 |
+
|
190 |
+
["Semantic Random Walk for Graph Representation Learning in Attributed Graphs"](https://arxiv.org/abs/2305.06531)
|
191 |
+
**Meng Qin**
|
192 |
+
*Hong Kong University of Science and Technology* (2023-05-11)
|
193 |
+
> We introduced a novel SGR method to generally formulate the network embedding in attributed graphs as a high-order proximity based embedding task of an auxilairy weighted graph with heterogeneous entities.
|
194 |
+
|
195 |
+
### qin2024irwe
|
196 |
+
|
197 |
+
["IRWE: Inductive Random Walk for Joint Inference of Identity and Position Network Embedding"](https://arxiv.org/abs/2401.00651)
|
198 |
+
**Meng Qin**, **Dit-Yan Yeung**
|
199 |
+
*Hong Kong University of Science and Technology* (2024-01-01)
|
200 |
+
> Since nodes in a community should be densely connected, nodes within the same community are more likely to be reached via RWs compared with those in different communities. Therefore, nodes with similar positions (e.g., in the same community) are highly believed to have similar RW statistics.
|
201 |
+
|
202 |
+
## – R –
|
203 |
+
|
204 |
+
### ramage2009rwt
|
205 |
+
|
206 |
+
["Random walks for text semantic similarity"](https://dl.acm.org/doi/10.5555/1708124.1708131)
|
207 |
+
**Daniel Ramage**, **Anna Rafferty**, **Christopher Manning**
|
208 |
+
_ACL-IJCNLP_ (2009-09-07)
|
209 |
+
> Our algorithm aggregates local relatedness information via a random walk over a graph constructed from an underlying lexical resource. The stationary distribution of the graph walk forms a “semantic signature” that can be compared to another such distribution to get a relatedness score for texts.
|
210 |
+
|
211 |
+
## – W –
|
212 |
+
|
213 |
+
### warmerdam2023pydata
|
214 |
+
|
215 |
+
["Natural Intelligence is All You Need™"](https://youtu.be/C9p7suS-NGk?si=7Ohq3BV654ia2Im4)
|
216 |
+
**Vincent Warmerdam**
|
217 |
+
*PyData Amsterdam* (2023-09-15)
|
218 |
+
> In this talk I will try to show you what might happen if you allow yourself the creative freedom to rethink and reinvent common practices once in a while. As it turns out, in order to do that, natural intelligence is all you need. And we may start needing a lot of it in the near future.
|
219 |
+
|
220 |
+
### wen2023mindmap
|
221 |
+
|
222 |
+
["MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large Language Models"](https://arxiv.org/abs/2308.09729)
|
223 |
+
**Yilin Wen**, **Zifeng Wang**, **Jimeng Sun**
|
224 |
+
_arXiv_ (2023-08-17)
|
225 |
+
> We build a prompting pipeline that endows LLMs with the capability of comprehending KG inputs and inferring with a combined implicit knowledge and the retrieved external knowledge.
|
226 |
+
|
227 |
+
### wolf2020transformers
|
228 |
+
|
229 |
+
["Transformers: State-of-the-Art Natural Language Processing"](https://doi.org/10.18653/v1/2020.emnlp-demos.6)
|
230 |
+
**Thomas Wolf**, **Lysandre Debut**, **Victor Sanh**, **Julien Chaumond**, **Clement Delangue**, **Anthony Moi**, **Pierric Cistac**, **Tim Rault**, **Remi Louf**, **Morgan Funtowicz**, **Joe Davison**, **Sam Shleifer**, **Patrick von Platen**, **Clara Ma**, **Yacine Jernite**, **Julien Plu**, **Canwen Xu**, **Teven Le Scao**, **Sylvain Gugger**, **Mariama Drame**, **Quentin Lhoest**, **Alexander Rush**
|
231 |
+
*EMNLP* (2020-11-16)
|
232 |
+
> The library consists of carefully engineered state-of-the art Transformer architectures under a unified API. Backing this library is a curated collection of pretrained models made by and available for the community.
|
docs/build.md
ADDED
@@ -0,0 +1,132 @@
1 |
+
# Build Instructions
|
2 |
+
|
3 |
+
<img src="../assets/nouns/api.png" alt="API by Adnen Kadri from the Noun Project" />
|
4 |
+
|
5 |
+
!!! note
|
6 |
+
In most cases you won't need to build this package locally.
|
7 |
+
|
8 |
+
Unless you're doing development work on the **textgraphs** library itself,
|
9 |
+
simply install based on the instructions in
|
10 |
+
["Getting Started"](https://derwen.ai/docs/txg/start/).
|
11 |
+
|
12 |
+
|
13 |
+
## Setup
|
14 |
+
|
15 |
+
To set up the build environment locally:
|
16 |
+
```
|
17 |
+
python3 -m venv venv
|
18 |
+
source venv/bin/activate
|
19 |
+
python3 -m pip install -U pip wheel setuptools
|
20 |
+
|
21 |
+
python3 -m pip install -e .
|
22 |
+
python3 -m pip install -r requirements-dev.txt
|
23 |
+
```
|
24 |
+
|
25 |
+
We use *pre-commit hooks* based on [`pre-commit`](https://pre-commit.com/)
|
26 |
+
and configure them locally with:
|
27 |
+
```
|
28 |
+
pre-commit install --hook-type pre-commit
|
29 |
+
```
|
30 |
+
|
31 |
+
|
32 |
+
## Test Coverage
|
33 |
+
|
34 |
+
This project uses
|
35 |
+
[`pytest`](https://docs.pytest.org/)
|
36 |
+
for *unit test* coverage.
|
37 |
+
Source for unit tests is in the
|
38 |
+
[`tests`](https://github.com/DerwenAI/textgraphs/tree/main/tests)
|
39 |
+
subdirectory.
|
40 |
+
|
41 |
+
To run the unit tests:
|
42 |
+
```
|
43 |
+
python3 -m pytest
|
44 |
+
```
|
45 |
+
|
46 |
+
Note that these tests run as part of the CI workflow
|
47 |
+
whenever code is updated on the GitHub repo.
|
48 |
+
|
49 |
+
|
50 |
+
## Online Documentation
|
51 |
+
|
52 |
+
To generate documentation pages, you will also need to download
|
53 |
+
[`ChromeDriver`](https://googlechromelabs.github.io/chrome-for-testing/)
|
54 |
+
for your version of the `Chrome` browser, saved as `chromedriver` in
|
55 |
+
this directory.
|
56 |
+
|
57 |
+
Source for the documentation is in the
|
58 |
+
[`docs`](https://github.com/DerwenAI/textgraphs/tree/main/docs)
|
59 |
+
subdirectory.
|
60 |
+
|
61 |
+
To build the documentation:
|
62 |
+
```
|
63 |
+
./bin/nb_md.sh
|
64 |
+
./pkg_doc.py docs/ref.md
|
65 |
+
mkdocs build
|
66 |
+
```
|
67 |
+
|
68 |
+
Then run `./bin/preview.py` and load <http://127.0.0.1:8000/docs/>
|
69 |
+
in your browser to preview the generated microsite locally.
|
70 |
+
|
71 |
+
To package the generated microsite for deployment on a
|
72 |
+
web server:
|
73 |
+
```
|
74 |
+
tar cvzf txg.tgz site/
|
75 |
+
```
|
76 |
+
|
77 |
+
|
78 |
+
## Remote Repo Updates
|
79 |
+
|
80 |
+
To update source code repo on GitHub:
|
81 |
+
|
82 |
+
```
|
83 |
+
git remote set-url origin https://github.com/DerwenAI/textgraphs.git
|
84 |
+
git push
|
85 |
+
```
|
86 |
+
|
87 |
+
Create new releases on GitHub, then run `git pull` locally prior to
|
88 |
+
updating Hugging Face or making a new package release.
|
89 |
+
|
90 |
+
To update source code repo+demo on Hugging Face:
|
91 |
+
|
92 |
+
```
|
93 |
+
git remote set-url origin https://huggingface.co/spaces/DerwenAI/textgraphs
|
94 |
+
git push
|
95 |
+
```
|
96 |
+
|
97 |
+
|
98 |
+
## Package Release
|
99 |
+
|
100 |
+
To update the [release on PyPi](https://pypi.org/project/textgraphs/):
|
101 |
+
```
|
102 |
+
./bin/push_pypi.sh
|
103 |
+
```
|
104 |
+
|
105 |
+
|
106 |
+
## Packaging
|
107 |
+
|
108 |
+
Both the spaCy and PyPi teams induce packaging errors since they
|
109 |
+
have "opinionated" views which conflict against each other and also
|
110 |
+
don't quite follow the [Python packaging standards](https://peps.python.org/pep-0621/).
|
111 |
+
|
112 |
+
Moreover, the various dependencies here use a wide range of approaches
|
113 |
+
for model downloads: quite appropriately, the spaCy team does not want
|
114 |
+
to package their language models on PyPi.
|
115 |
+
However, they don't use more contemporary means of model download,
|
116 |
+
such as HF transformers, either -- and that triggers logging problems.
|
117 |
+
Overall, logging approaches used by the dependencies here for errors/warnings
|
118 |
+
are mostly ad-hoc.
|
119 |
+
|
120 |
+
These three issues (packaging, model downloads, logging) pose a small nightmare
|
121 |
+
for managing Python library packaging downstream.
|
122 |
+
To that point, this project implements several workarounds so that
|
123 |
+
applications can download from PyPi.
|
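For example, a minimal sketch of the logging side of these workarounds -- hypothetical code, not this project's actual implementation -- would quiet the noisier dependencies before importing them:

```python
import logging
import os

# assumption: quieting HF transformers/tokenizers via their documented env vars;
# the actual workarounds used in this project may differ
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# dial down ad-hoc loggers used by other dependencies
for name in ("transformers", "urllib3"):
    logging.getLogger(name).setLevel(logging.ERROR)
```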
124 |
+
|
125 |
+
Meanwhile keep watch on developments of the following dependencies,
|
126 |
+
if they introduce breaking changes or move toward more standard
|
127 |
+
packaging practices:
|
128 |
+
|
129 |
+
* `spaCy` -- model downloads, logging
|
130 |
+
* `OpenNRE` -- PyPi packaging, logging
|
131 |
+
* HF `transformers` and `tokenizers` -- logging
|
132 |
+
* WikiMedia APIs -- SSL certificate expiry
|
docs/conclude.md
ADDED
@@ -0,0 +1,53 @@
1 |
+
# Conclusions
|
2 |
+
|
3 |
+
**DRAFT** (WIP)
|
4 |
+
|
5 |
+
The `TextGraphs` library provides a highly configurable, extensible, open source Python framework for integrating and evaluating several LLM components. It has been built with attention to concurrency and parallelism, to support high-performance computing on distributed systems.
|
6 |
+
|
7 |
+
TODO:
|
8 |
+
|
9 |
+
- leverage co-reference
|
10 |
+
- leverage closure constrained by domain/range
|
11 |
+
- general => specific, uncertain => confident
|
12 |
+
|
13 |
+
The state of _relation extraction_ is arguably immature.
|
14 |
+
While the papers in this area compare against benchmarks, their training datasets mostly have been built from Wikidata sources, and the inferred relations result in _labels_, not IRIs -- for example, a model may emit only the string label "nationality" rather than a resolvable IRI such as `https://schema.org/nationality`.
|
15 |
+
This precludes downstream use of the inferred relations for semantic inference.
|
16 |
+
Ultimately, how can better training data be developed -- e.g., for relation extraction -- to improve large models used in constructing/augmenting knowledge graphs?
|
17 |
+
|
18 |
+
## Questions for Follow Up Research
|
19 |
+
|
20 |
+
Many existing projects produce results which are **descriptive, but not computable**.
|
21 |
+
However, given recent innovations, such as _DPO_, there appear to be many opportunities for reworking the training datasets used in
|
22 |
+
NRE and RE models, following the pattern of `Notus`.
|
23 |
+
|
24 |
+
**R1**: we have demonstrated how to leverage LLM components while emphasizing HITL (domain experts) and quality of results
|
25 |
+
|
26 |
+
|
27 |
+
**R2**: we have suggested areas where investments in data quality
|
28 |
+
may provide substantial gains
|
29 |
+
|
30 |
+
One key take-away from this project is that the model deployments are relatively haphazard across a wide spectrum of performance: some of the open source dependencies use efficient frameworks such as Hugging Face `transformers` to load models, while others use ad-hoc approaches which are much less performant.
|
31 |
+
|
32 |
+
Granted, use of LLMs and other deep learning models is expected to increase computational requirements substantially.
|
33 |
+
Given the integration of APIs, the compute, memory, and network requirements for running the `TextGraphs` library in production can be quite large.
|
34 |
+
Software engineering optimizations can reduce these requirements substantially through use of hardware acceleration, localized services, proxy/caching, and concurrency.
|
35 |
+
|
36 |
+
However, a more effective approach would be to make investments in data quality (training datasets, benchmarks, evals, etc.) for gains within the core technologies used here: NER, RE, etc.
|
37 |
+
Data-first iterations on the model dependencies can alleviate much of this problem.
|
38 |
+
|
39 |
+
|
40 |
+
**R3**: we have proposed a rubric for evaluating/rating ML open source
|
41 |
+
w.r.t. production use cases
|
42 |
+
|
43 |
+
This project integrates available open source projects across a wide range of NLP topics.
|
44 |
+
Perspectives were gained from evaluating many open source LLM projects related to NLP components, and the state of readiness for their use in production libraries overall.
|
45 |
+
|
46 |
+
Note that reproducibility rates are abysmally low for open source which accompanies machine learning research papers.
|
47 |
+
Few projects install correctly, and fewer still run without exceptions.
|
48 |
+
Even the better available OSS projects for a given research topic (e.g., _graph embeddings_, _relation extraction_) tend not to have been maintained for years. Of the projects which do run, few reproduce their published results, and most are oriented toward command-line (CLI) use to prove specific benchmark claims.
|
49 |
+
These tend to be difficult to rework into production-quality libraries, due to concerns about performance, security, licensing, etc.
|
50 |
+
|
51 |
+
As an outcome of this inquiry, this project presents a rubric for evaluating research papers and their associated code, based on reproducibility and eventual usefulness in software implementations.
|
52 |
+
|
53 |
+
The views expressed are those of the authors and do not reflect the official policy or position of the funding organizations.
|
docs/details.md
ADDED
@@ -0,0 +1,64 @@
1 |
+
This project implements an LLM-augmented `textgraph` algorithm for
|
2 |
+
constructing a _lemma graph_ from raw, unstructured text source.
|
3 |
+
|
4 |
+
The `TextGraphs` library is based on work developed by
|
5 |
+
[Derwen](https://derwen.ai/graph)
|
6 |
+
in 2023 Q2 for customer apps and used in our `Cysoni`
|
7 |
+
product.
|
8 |
+
|
9 |
+
This library integrates code from:
|
10 |
+
|
11 |
+
* [`SpanMarker`](https://github.com/tomaarsen/SpanMarkerNER/)
|
12 |
+
* [`spaCy-DBpedia-Spotlight`](https://github.com/MartinoMensio/spacy-dbpedia-spotlight)
|
13 |
+
* [`REBEL`](https://github.com/Babelscape/rebel)
|
14 |
+
* [`OpenNRE`](https://github.com/thunlp/OpenNRE/)
|
15 |
+
* [`qwikidata`](https://github.com/kensho-technologies/qwikidata)
|
16 |
+
* [`pulp`](https://github.com/coin-or/pulp)
|
17 |
+
* [`spaCy`](https://spacy.io/)
|
18 |
+
* [`HF transformers`](https://huggingface.co/docs/transformers/index)
|
19 |
+
* [`PyTextRank`](https://github.com/DerwenAI/pytextrank/)
|
20 |
+
|
21 |
+
|
22 |
+
For more background about early efforts which led to this line of inquiry, see the recent talks:
|
23 |
+
|
24 |
+
* ["Language, Graphs, and AI in Industry"](https://derwen.ai/s/mqqm)
|
25 |
+
**Paco Nathan**, K1st World (2023-10-11) ([video](https://derwen.ai/s/4h2kswhrm3gc))
|
26 |
+
* ["Language Tools for Creators"](https://derwen.ai/s/rhvg)
|
27 |
+
**Paco Nathan**, FOSSY (2023-07-13)
|
28 |
+
|
29 |
+
|
30 |
+
The `TextGraphs` library shows integrations of several of these kinds
|
31 |
+
of components, complemented with use of graph queries, graph algorithms,
|
32 |
+
and other related tooling.
|
33 |
+
Admittedly, the results present a "hybrid" approach:
|
34 |
+
it's not purely "generative" -- whatever that might mean.
|
35 |
+
|
36 |
+
A core principle here is to provide results from the natural language
|
37 |
+
workflows which may be used for expert feedback.
|
38 |
+
In other words, how can we support means for leveraging
|
39 |
+
_human-in-the-loop_ (HITL) process?
|
40 |
+
|
41 |
+
Another principle has been to create a Python library built to produce
|
42 |
+
configurable, extensible pipelines.
|
43 |
+
Care has been given to writing code that can be run concurrently
|
44 |
+
(e.g., leveraging `asyncio`), using dependencies which have
|
45 |
+
business-friendly licenses, and paying attention to security concerns.
|
46 |
+
|
47 |
+
The library provides three main affordances for AI applications:
|
48 |
+
|
49 |
+
1. With the default settings, one can use `TextGraphs` to extract ranked key phrases from raw text -- even without using any of the additional deep learning models (see the sketch after this list).
|
50 |
+
|
51 |
+
2. Going a few further steps, one can generate an RDF or LPG graph from raw texts, and make use of _entity linking_, _relation extraction_, and other techniques to ground the natural language parsing by leveraging some knowledge graph which represents a particular domain. Default examples use WikiMedia graphs: DBPedia, Wikidata, etc.
|
52 |
+
|
53 |
+
3. A third set of goals for `TextGraphs` is to provide a "playground" or "gym" for evaluating _graph levels of detail_, i.e., abstraction layers for knowledge graphs, and to explore some of the emerging work to produce _foundation models_ for knowledge graphs through topological transforms.
|
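As a minimal sketch of the first affordance, the following mirrors the calling sequence shown in the demo notebook; the no-argument `TextGraphs()` constructor is an assumption here, since the notebook passes an explicit `PipelineFactory`:

```python
import textgraphs

# assumption: library defaults suffice when no PipelineFactory is given
tg = textgraphs.TextGraphs()

pipe = tg.create_pipeline(
    "Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany."
)

# collect graph elements from the parse, build the lemma graph, then rank phrases
tg.collect_graph_elements(pipe)
tg.construct_lemma_graph()
tg.calc_phrase_ranks(pr_alpha = textgraphs.PAGERANK_ALPHA)

# ranked key phrases as a pandas DataFrame
print(tg.get_phrases_as_df())
```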
54 |
+
|
55 |
+
Regarding the third point, consider how language parsing produces
|
56 |
+
graphs by definition, although NLP results tend to be quite _noisy_.
|
57 |
+
The annotations inferred by NLP pipelines often get thrown out.
|
58 |
+
This seemed like a good opportunity to generate sample data for
|
59 |
+
"condensing" graphs into more abstracted representations.
|
60 |
+
In other words, patterns within the relatively noisy parse results
|
61 |
+
can be condensed into relatively refined knowledge graph elements.
|
62 |
+
|
63 |
+
Note that while the `spaCy` library for NLP plays a central role, the
|
64 |
+
`TextGraphs` library is not intended to become a `spaCy` pipeline.
|
docs/ex0_0.md
ADDED
@@ -0,0 +1,689 @@
1 |
+
|
2 |
+
|
3 |
+
!!! note
|
4 |
+
To run this notebook in JupyterLab, load [`examples/ex0_0.ipynb`](https://github.com/DerwenAI/textgraphs/blob/main/examples/ex0_0.ipynb)
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
# demo: TextGraphs + LLMs to construct a 'lemma graph'
|
9 |
+
|
10 |
+
The _TextGraphs_ library is intended for iterating through a sequence of paragraphs.
|
11 |
+
|
12 |
+
## environment
|
13 |
+
|
14 |
+
|
15 |
+
```python
|
16 |
+
from IPython.display import display, HTML, Image, SVG
|
17 |
+
import pathlib
|
18 |
+
import typing
|
19 |
+
|
20 |
+
from icecream import ic
|
21 |
+
from pyinstrument import Profiler
|
22 |
+
import matplotlib.pyplot as plt
|
23 |
+
import pandas as pd
|
24 |
+
import pyvis
|
25 |
+
import spacy
|
26 |
+
|
27 |
+
import textgraphs
|
28 |
+
```
|
29 |
+
|
30 |
+
|
31 |
+
```python
|
32 |
+
%load_ext watermark
|
33 |
+
```
|
34 |
+
|
35 |
+
|
36 |
+
```python
|
37 |
+
%watermark
|
38 |
+
```
|
39 |
+
|
40 |
+
Last updated: 2024-01-16T17:41:51.229985-08:00
|
41 |
+
|
42 |
+
Python implementation: CPython
|
43 |
+
Python version : 3.10.11
|
44 |
+
IPython version : 8.20.0
|
45 |
+
|
46 |
+
Compiler : Clang 13.0.0 (clang-1300.0.29.30)
|
47 |
+
OS : Darwin
|
48 |
+
Release : 21.6.0
|
49 |
+
Machine : x86_64
|
50 |
+
Processor : i386
|
51 |
+
CPU cores : 8
|
52 |
+
Architecture: 64bit
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
|
57 |
+
```python
|
58 |
+
%watermark --iversions
|
59 |
+
```
|
60 |
+
|
61 |
+
sys : 3.10.11 (v3.10.11:7d4cc5aa85, Apr 4 2023, 19:05:19) [Clang 13.0.0 (clang-1300.0.29.30)]
|
62 |
+
spacy : 3.7.2
|
63 |
+
pandas : 2.1.4
|
64 |
+
matplotlib: 3.8.2
|
65 |
+
textgraphs: 0.5.0
|
66 |
+
pyvis : 0.3.2
|
67 |
+
|
68 |
+
|
69 |
+
|
70 |
+
## parse a document
|
71 |
+
|
72 |
+
provide the source text
|
73 |
+
|
74 |
+
|
75 |
+
```python
|
76 |
+
SRC_TEXT: str = """
|
77 |
+
Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
|
78 |
+
After the war, Werner fled to America to become famous.
|
79 |
+
"""
|
80 |
+
```
|
81 |
+
|
82 |
+
set up the statistical stack profiling
|
83 |
+
|
84 |
+
|
85 |
+
```python
|
86 |
+
profiler: Profiler = Profiler()
|
87 |
+
profiler.start()
|
88 |
+
```
|
89 |
+
|
90 |
+
set up the `TextGraphs` pipeline
|
91 |
+
|
92 |
+
|
93 |
+
```python
|
94 |
+
tg: textgraphs.TextGraphs = textgraphs.TextGraphs(
|
95 |
+
factory = textgraphs.PipelineFactory(
|
96 |
+
spacy_model = textgraphs.SPACY_MODEL,
|
97 |
+
ner = None,
|
98 |
+
kg = textgraphs.KGWikiMedia(
|
99 |
+
spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API,
|
100 |
+
dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API,
|
101 |
+
dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API,
|
102 |
+
wikidata_api = textgraphs.WIKIDATA_API,
|
103 |
+
min_alias = textgraphs.DBPEDIA_MIN_ALIAS,
|
104 |
+
min_similarity = textgraphs.DBPEDIA_MIN_SIM,
|
105 |
+
),
|
106 |
+
infer_rels = [
|
107 |
+
textgraphs.InferRel_OpenNRE(
|
108 |
+
model = textgraphs.OPENNRE_MODEL,
|
109 |
+
max_skip = textgraphs.MAX_SKIP,
|
110 |
+
min_prob = textgraphs.OPENNRE_MIN_PROB,
|
111 |
+
),
|
112 |
+
textgraphs.InferRel_Rebel(
|
113 |
+
lang = "en_XX",
|
114 |
+
mrebel_model = textgraphs.MREBEL_MODEL,
|
115 |
+
),
|
116 |
+
],
|
117 |
+
),
|
118 |
+
)
|
119 |
+
|
120 |
+
pipe: textgraphs.Pipeline = tg.create_pipeline(
|
121 |
+
SRC_TEXT.strip(),
|
122 |
+
)
|
123 |
+
```
|
124 |
+
|
125 |
+
## visualize the parse results
|
126 |
+
|
127 |
+
|
128 |
+
```python
|
129 |
+
spacy.displacy.render(
|
130 |
+
pipe.ner_doc,
|
131 |
+
style = "ent",
|
132 |
+
jupyter = True,
|
133 |
+
)
|
134 |
+
```
|
135 |
+
|
136 |
+
|
137 |
+
<span class="tex2jax_ignore"><div class="entities" style="line-height: 2.5; direction: ltr">
|
138 |
+
<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
139 |
+
Werner Herzog
|
140 |
+
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
|
141 |
+
</mark>
|
142 |
+
is a remarkable filmmaker and an intellectual originally from
|
143 |
+
<mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
144 |
+
Germany
|
145 |
+
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">GPE</span>
|
146 |
+
</mark>
|
147 |
+
, the son of
|
148 |
+
<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
149 |
+
Dietrich Herzog
|
150 |
+
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
|
151 |
+
</mark>
|
152 |
+
.<br>After the war,
|
153 |
+
<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
154 |
+
Werner
|
155 |
+
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
|
156 |
+
</mark>
|
157 |
+
fled to
|
158 |
+
<mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
159 |
+
America
|
160 |
+
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">GPE</span>
|
161 |
+
</mark>
|
162 |
+
to become famous.</div></span>
|
163 |
+
|
164 |
+
|
165 |
+
|
166 |
+
```python
|
167 |
+
parse_svg: str = spacy.displacy.render(
|
168 |
+
pipe.ner_doc,
|
169 |
+
style = "dep",
|
170 |
+
jupyter = False,
|
171 |
+
)
|
172 |
+
|
173 |
+
display(SVG(parse_svg))
|
174 |
+
```
|
175 |
+
|
176 |
+
|
177 |
+
|
178 |
+

|
179 |
+
|
180 |
+
|
181 |
+
|
182 |
+
## collect graph elements from the parse
|
183 |
+
|
184 |
+
|
185 |
+
```python
|
186 |
+
tg.collect_graph_elements(
|
187 |
+
pipe,
|
188 |
+
debug = False,
|
189 |
+
)
|
190 |
+
```
|
191 |
+
|
192 |
+
|
193 |
+
```python
|
194 |
+
ic(len(tg.nodes.values()));
|
195 |
+
ic(len(tg.edges.values()));
|
196 |
+
```
|
197 |
+
|
198 |
+
ic| len(tg.nodes.values()): 36
|
199 |
+
ic| len(tg.edges.values()): 42
|
200 |
+
|
201 |
+
|
202 |
+
## perform entity linking
|
203 |
+
|
204 |
+
|
205 |
+
```python
|
206 |
+
tg.perform_entity_linking(
|
207 |
+
pipe,
|
208 |
+
debug = False,
|
209 |
+
)
|
210 |
+
```
|
211 |
+
|
212 |
+
## infer relations
|
213 |
+
|
214 |
+
|
215 |
+
```python
|
216 |
+
inferred_edges: list = await tg.infer_relations_async(
|
217 |
+
pipe,
|
218 |
+
debug = False,
|
219 |
+
)
|
220 |
+
|
221 |
+
inferred_edges
|
222 |
+
```
|
223 |
+
|
224 |
+
|
225 |
+
|
226 |
+
|
227 |
+
[Edge(src_node=0, dst_node=10, kind=<RelEnum.INF: 2>, rel='https://schema.org/nationality', prob=1.0, count=1),
|
228 |
+
Edge(src_node=15, dst_node=0, kind=<RelEnum.INF: 2>, rel='https://schema.org/children', prob=1.0, count=1),
|
229 |
+
Edge(src_node=27, dst_node=22, kind=<RelEnum.INF: 2>, rel='https://schema.org/event', prob=1.0, count=1)]
|
230 |
+
|
231 |
+
|
232 |
+
|
233 |
+
## construct a lemma graph
|
234 |
+
|
235 |
+
|
236 |
+
```python
|
237 |
+
tg.construct_lemma_graph(
|
238 |
+
debug = False,
|
239 |
+
)
|
240 |
+
```
|
241 |
+
|
242 |
+
## extract ranked entities
|
243 |
+
|
244 |
+
|
245 |
+
```python
|
246 |
+
tg.calc_phrase_ranks(
|
247 |
+
pr_alpha = textgraphs.PAGERANK_ALPHA,
|
248 |
+
debug = False,
|
249 |
+
)
|
250 |
+
```
|
251 |
+
|
252 |
+
show the resulting entities extracted from the document
|
253 |
+
|
254 |
+
|
255 |
+
```python
|
256 |
+
df: pd.DataFrame = tg.get_phrases_as_df()
|
257 |
+
df
|
258 |
+
```
|
259 |
+
|
260 |
+
|
261 |
+
|
262 |
+
|
263 |
+
<div>
|
264 |
+
<style scoped>
|
265 |
+
.dataframe tbody tr th:only-of-type {
|
266 |
+
vertical-align: middle;
|
267 |
+
}
|
268 |
+
|
269 |
+
.dataframe tbody tr th {
|
270 |
+
vertical-align: top;
|
271 |
+
}
|
272 |
+
|
273 |
+
.dataframe thead th {
|
274 |
+
text-align: right;
|
275 |
+
}
|
276 |
+
</style>
|
277 |
+
<table border="1" class="dataframe">
|
278 |
+
<thead>
|
279 |
+
<tr style="text-align: right;">
|
280 |
+
<th></th>
|
281 |
+
<th>node_id</th>
|
282 |
+
<th>text</th>
|
283 |
+
<th>pos</th>
|
284 |
+
<th>label</th>
|
285 |
+
<th>count</th>
|
286 |
+
<th>weight</th>
|
287 |
+
</tr>
|
288 |
+
</thead>
|
289 |
+
<tbody>
|
290 |
+
<tr>
|
291 |
+
<th>0</th>
|
292 |
+
<td>0</td>
|
293 |
+
<td>Werner Herzog</td>
|
294 |
+
<td>PROPN</td>
|
295 |
+
<td>dbr:Werner_Herzog</td>
|
296 |
+
<td>1</td>
|
297 |
+
<td>0.080547</td>
|
298 |
+
</tr>
|
299 |
+
<tr>
|
300 |
+
<th>1</th>
|
301 |
+
<td>10</td>
|
302 |
+
<td>Germany</td>
|
303 |
+
<td>PROPN</td>
|
304 |
+
<td>dbr:Germany</td>
|
305 |
+
<td>1</td>
|
306 |
+
<td>0.080437</td>
|
307 |
+
</tr>
|
308 |
+
<tr>
|
309 |
+
<th>2</th>
|
310 |
+
<td>15</td>
|
311 |
+
<td>Dietrich Herzog</td>
|
312 |
+
<td>PROPN</td>
|
313 |
+
<td>dbo:Person</td>
|
314 |
+
<td>1</td>
|
315 |
+
<td>0.079048</td>
|
316 |
+
</tr>
|
317 |
+
<tr>
|
318 |
+
<th>3</th>
|
319 |
+
<td>27</td>
|
320 |
+
<td>America</td>
|
321 |
+
<td>PROPN</td>
|
322 |
+
<td>dbr:United_States</td>
|
323 |
+
<td>1</td>
|
324 |
+
<td>0.079048</td>
|
325 |
+
</tr>
|
326 |
+
<tr>
|
327 |
+
<th>4</th>
|
328 |
+
<td>24</td>
|
329 |
+
<td>Werner</td>
|
330 |
+
<td>PROPN</td>
|
331 |
+
<td>dbo:Person</td>
|
332 |
+
<td>1</td>
|
333 |
+
<td>0.077633</td>
|
334 |
+
</tr>
|
335 |
+
<tr>
|
336 |
+
<th>5</th>
|
337 |
+
<td>4</td>
|
338 |
+
<td>filmmaker</td>
|
339 |
+
<td>NOUN</td>
|
340 |
+
<td>owl:Thing</td>
|
341 |
+
<td>1</td>
|
342 |
+
<td>0.076309</td>
|
343 |
+
</tr>
|
344 |
+
<tr>
|
345 |
+
<th>6</th>
|
346 |
+
<td>22</td>
|
347 |
+
<td>war</td>
|
348 |
+
<td>NOUN</td>
|
349 |
+
<td>owl:Thing</td>
|
350 |
+
<td>1</td>
|
351 |
+
<td>0.076309</td>
|
352 |
+
</tr>
|
353 |
+
<tr>
|
354 |
+
<th>7</th>
|
355 |
+
<td>32</td>
|
356 |
+
<td>a remarkable filmmaker</td>
|
357 |
+
<td>noun_chunk</td>
|
358 |
+
<td>None</td>
|
359 |
+
<td>1</td>
|
360 |
+
<td>0.076077</td>
|
361 |
+
</tr>
|
362 |
+
<tr>
|
363 |
+
<th>8</th>
|
364 |
+
<td>7</td>
|
365 |
+
<td>intellectual</td>
|
366 |
+
<td>NOUN</td>
|
367 |
+
<td>owl:Thing</td>
|
368 |
+
<td>1</td>
|
369 |
+
<td>0.074725</td>
|
370 |
+
</tr>
|
371 |
+
<tr>
|
372 |
+
<th>9</th>
|
373 |
+
<td>13</td>
|
374 |
+
<td>son</td>
|
375 |
+
<td>NOUN</td>
|
376 |
+
<td>owl:Thing</td>
|
377 |
+
<td>1</td>
|
378 |
+
<td>0.074725</td>
|
379 |
+
</tr>
|
380 |
+
<tr>
|
381 |
+
<th>10</th>
|
382 |
+
<td>33</td>
|
383 |
+
<td>an intellectual</td>
|
384 |
+
<td>noun_chunk</td>
|
385 |
+
<td>None</td>
|
386 |
+
<td>1</td>
|
387 |
+
<td>0.074606</td>
|
388 |
+
</tr>
|
389 |
+
<tr>
|
390 |
+
<th>11</th>
|
391 |
+
<td>34</td>
|
392 |
+
<td>the son</td>
|
393 |
+
<td>noun_chunk</td>
|
394 |
+
<td>None</td>
|
395 |
+
<td>1</td>
|
396 |
+
<td>0.074606</td>
|
397 |
+
</tr>
|
398 |
+
<tr>
|
399 |
+
<th>12</th>
|
400 |
+
<td>35</td>
|
401 |
+
<td>the war</td>
|
402 |
+
<td>noun_chunk</td>
|
403 |
+
<td>None</td>
|
404 |
+
<td>1</td>
|
405 |
+
<td>0.074606</td>
|
406 |
+
</tr>
|
407 |
+
</tbody>
|
408 |
+
</table>
|
409 |
+
</div>
|
410 |
+
|
411 |
+
|
412 |
+
|
413 |
+
## visualize the lemma graph
|
414 |
+
|
415 |
+
|
416 |
+
```python
|
417 |
+
render: textgraphs.RenderPyVis = tg.create_render()
|
418 |
+
|
419 |
+
pv_graph: pyvis.network.Network = render.render_lemma_graph(
|
420 |
+
debug = False,
|
421 |
+
)
|
422 |
+
```
|
423 |
+
|
424 |
+
initialize the layout parameters
|
425 |
+
|
426 |
+
|
427 |
+
```python
|
428 |
+
pv_graph.force_atlas_2based(
|
429 |
+
gravity = -38,
|
430 |
+
central_gravity = 0.01,
|
431 |
+
spring_length = 231,
|
432 |
+
spring_strength = 0.7,
|
433 |
+
damping = 0.8,
|
434 |
+
overlap = 0,
|
435 |
+
)
|
436 |
+
|
437 |
+
pv_graph.show_buttons(filter_ = [ "physics" ])
|
438 |
+
pv_graph.toggle_physics(True)
|
439 |
+
```
|
440 |
+
|
441 |
+
|
442 |
+
```python
|
443 |
+
pv_graph.prep_notebook()
|
444 |
+
pv_graph.show("tmp.fig01.html")
|
445 |
+
```
|
446 |
+
|
447 |
+
tmp.fig01.html
|
448 |
+
|
449 |
+
|
450 |
+
|
451 |
+
|
452 |
+
|
453 |
+
|
454 |
+

|
455 |
+
|
456 |
+
|
457 |
+
|
458 |
+
|
459 |
+
## generate a word cloud
|
460 |
+
|
461 |
+
|
462 |
+
```python
|
463 |
+
wordcloud = render.generate_wordcloud()
|
464 |
+
display(wordcloud.to_image())
|
465 |
+
```
|
466 |
+
|
467 |
+
|
468 |
+
|
469 |
+

|
470 |
+
|
471 |
+
|
472 |
+
|
473 |
+
## cluster communities in the lemma graph
|
474 |
+
|
475 |
+
In the tutorial
|
476 |
+
<a href="https://towardsdatascience.com/how-to-convert-any-text-into-a-graph-of-concepts-110844f22a1a" target="_blank">"How to Convert Any Text Into a Graph of Concepts"</a>,
|
477 |
+
Rahul Nayak uses the
|
478 |
+
<a href="https://en.wikipedia.org/wiki/Girvan%E2%80%93Newman_algorithm"><em>girvan-newman</em></a>
|
479 |
+
algorithm to split the graph into communities, then clusters on those communities.
|
480 |
+
His approach works well for unsupervised clustering of key phrases which have been extracted from many documents.
|
481 |
+
In contrast, Nayak was working with entities extracted from "chunks" of text, not with a text graph.
|
482 |
+
|
483 |
+
|
484 |
+
```python
|
485 |
+
render.draw_communities();
|
486 |
+
```
|
487 |
+
|
488 |
+
|
489 |
+
|
490 |
+

|
491 |
+
|
492 |
+
|
493 |
+
|
494 |
+
## graph of relations transform
|
495 |
+
|
496 |
+
Show a transformed graph, based on _graph of relations_ (see: `lee2023ingram`)
|
497 |
+
|
498 |
+
|
499 |
+
```python
|
500 |
+
graph: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(
|
501 |
+
tg
|
502 |
+
)
|
503 |
+
|
504 |
+
graph.seeds()
|
505 |
+
graph.construct_gor()
|
506 |
+
```
|
507 |
+
|
508 |
+
|
509 |
+
```python
|
510 |
+
scores: typing.Dict[ tuple, float ] = graph.get_affinity_scores()
|
511 |
+
pv_graph: pyvis.network.Network = graph.render_gor_pyvis(scores)
|
512 |
+
|
513 |
+
pv_graph.force_atlas_2based(
|
514 |
+
gravity = -38,
|
515 |
+
central_gravity = 0.01,
|
516 |
+
spring_length = 231,
|
517 |
+
spring_strength = 0.7,
|
518 |
+
damping = 0.8,
|
519 |
+
overlap = 0,
|
520 |
+
)
|
521 |
+
|
522 |
+
pv_graph.show_buttons(filter_ = [ "physics" ])
|
523 |
+
pv_graph.toggle_physics(True)
|
524 |
+
|
525 |
+
pv_graph.prep_notebook()
|
526 |
+
pv_graph.show("tmp.fig02.html")
|
527 |
+
```
|
528 |
+
|
529 |
+
tmp.fig02.html
|
530 |
+
|
531 |
+
|
532 |
+
|
533 |
+
|
534 |
+
|
535 |
+
|
536 |
+

|
537 |
+
|
538 |
+
|
539 |
+
|
540 |
+
|
541 |
+
*What does this transform provide?*
|
542 |
+
|
543 |
+
By using a _graph of relations_ dual representation of our graph data, first and foremost we obtain a more compact representation of the relations in the graph, and means of making inferences (e.g., _link prediction_) where there is substantially more invariance in the training data.
|
544 |
+
|
545 |
+
Also recognize that for a parse graph of a paragraph in the English language, the most interesting nodes will probably be either subjects (`nsubj`) or direct objects (`pobj`). Here in the _graph of relations_ we see illustrated how the important details from _entity linking_ tend to cluster near either `nsubj` or `pobj` entities, connected through punctuation. This is not as readily observed in the earlier visualization of the _lemma graph_.
|
546 |
+
|
547 |
+
## extract as RDF triples
|
548 |
+
|
549 |
+
Extract the nodes and edges which have IRIs, to create an "abstraction layer" as a semantic graph at a higher level of detail above the _lemma graph_:
|
550 |
+
|
551 |
+
|
552 |
+
```python
|
553 |
+
triples: str = tg.export_rdf()
|
554 |
+
print(triples)
|
555 |
+
```
|
556 |
+
|
557 |
+
@base <https://github.com/DerwenAI/textgraphs/ns/> .
|
558 |
+
@prefix dbo: <http://dbpedia.org/ontology/> .
|
559 |
+
@prefix dbr: <http://dbpedia.org/resource/> .
|
560 |
+
@prefix schema: <https://schema.org/> .
|
561 |
+
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
|
562 |
+
@prefix wd_ent: <http://www.wikidata.org/entity/> .
|
563 |
+
|
564 |
+
dbr:Germany skos:definition "Germany (German: Deutschland, German pronunciation: [ˈdɔʏtʃlant]), constitutionally the Federal"@en ;
|
565 |
+
skos:prefLabel "Germany"@en .
|
566 |
+
|
567 |
+
dbr:United_States skos:definition "The United States of America (USA), commonly known as the United States (U.S. or US) or America"@en ;
|
568 |
+
skos:prefLabel "United States"@en .
|
569 |
+
|
570 |
+
dbr:Werner_Herzog skos:definition "Werner Herzog (German: [ˈvɛɐ̯nɐ ˈhɛɐ̯tsoːk]; born 5 September 1942) is a German film director"@en ;
|
571 |
+
skos:prefLabel "Werner Herzog"@en .
|
572 |
+
|
573 |
+
wd_ent:Q183 skos:definition "country in Central Europe"@en ;
|
574 |
+
skos:prefLabel "Germany"@en .
|
575 |
+
|
576 |
+
wd_ent:Q44131 skos:definition "German film director, producer, screenwriter, actor and opera director"@en ;
|
577 |
+
skos:prefLabel "Werner Herzog"@en .
|
578 |
+
|
579 |
+
<entity/america_PROPN> a dbo:Country ;
|
580 |
+
skos:prefLabel "America"@en ;
|
581 |
+
schema:event <entity/war_NOUN> .
|
582 |
+
|
583 |
+
<entity/dietrich_PROPN_herzog_PROPN> a dbo:Person ;
|
584 |
+
skos:prefLabel "Dietrich Herzog"@en ;
|
585 |
+
schema:children <entity/werner_PROPN_herzog_PROPN> .
|
586 |
+
|
587 |
+
<entity/filmmaker_NOUN> skos:prefLabel "filmmaker"@en .
|
588 |
+
|
589 |
+
<entity/intellectual_NOUN> skos:prefLabel "intellectual"@en .
|
590 |
+
|
591 |
+
<entity/son_NOUN> skos:prefLabel "son"@en .
|
592 |
+
|
593 |
+
<entity/werner_PROPN> a dbo:Person ;
|
594 |
+
skos:prefLabel "Werner"@en .
|
595 |
+
|
596 |
+
<entity/germany_PROPN> a dbo:Country ;
|
597 |
+
skos:prefLabel "Germany"@en .
|
598 |
+
|
599 |
+
<entity/war_NOUN> skos:prefLabel "war"@en .
|
600 |
+
|
601 |
+
<entity/werner_PROPN_herzog_PROPN> a dbo:Person ;
|
602 |
+
skos:prefLabel "Werner Herzog"@en ;
|
603 |
+
schema:nationality <entity/germany_PROPN> .
|
604 |
+
|
605 |
+
dbo:Country skos:definition "Countries, cities, states"@en ;
|
606 |
+
skos:prefLabel "country"@en .
|
607 |
+
|
608 |
+
dbo:Person skos:definition "People, including fictional"@en ;
|
609 |
+
skos:prefLabel "person"@en .
|
610 |
+
|
611 |
+
|
612 |
+
|
613 |
+
|
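Since the export is Turtle, the resulting string can be handed directly to an RDF library for downstream queries. A minimal sketch, assuming `rdflib` is installed (this step is not part of the original notebook):

```python
import rdflib

# parse the exported Turtle back into an RDF graph
rdf_graph = rdflib.Graph()
rdf_graph.parse(data=triples, format="turtle")

# list every node that carries a skos:prefLabel
sparql = """
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT ?node ?label
WHERE { ?node skos:prefLabel ?label }
"""

for row in rdf_graph.query(sparql):
    print(row.node, row.label)
```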
614 |
+
## statistical stack profile instrumentation
|
615 |
+
|
616 |
+
|
617 |
+
```python
|
618 |
+
profiler.stop()
|
619 |
+
```
|
620 |
+
|
621 |
+
|
622 |
+
|
623 |
+
|
624 |
+
<pyinstrument.session.Session at 0x141446080>
|
625 |
+
|
626 |
+
|
627 |
+
|
628 |
+
|
629 |
+
```python
|
630 |
+
profiler.print()
|
631 |
+
```
|
632 |
+
|
633 |
+
|
634 |
+
_ ._ __/__ _ _ _ _ _/_ Recorded: 17:41:51 Samples: 11163
|
635 |
+
/_//_/// /_\ / //_// / //_'/ // Duration: 57.137 CPU time: 72.235
|
636 |
+
/ _/ v4.6.1
|
637 |
+
|
638 |
+
Program: /Users/paco/src/textgraphs/venv/lib/python3.10/site-packages/ipykernel_launcher.py -f /Users/paco/Library/Jupyter/runtime/kernel-8ffadb7d-3b45-4e0e-a94f-f098e5ad9fbe.json
|
639 |
+
|
640 |
+
57.136 _UnixSelectorEventLoop._run_once asyncio/base_events.py:1832
|
641 |
+
└─ 57.135 Handle._run asyncio/events.py:78
|
642 |
+
[12 frames hidden] asyncio, ipykernel, IPython
|
643 |
+
41.912 ZMQInteractiveShell.run_ast_nodes IPython/core/interactiveshell.py:3394
|
644 |
+
├─ 20.701 <module> ../ipykernel_5151/1245857438.py:1
|
645 |
+
│ └─ 20.701 TextGraphs.perform_entity_linking textgraphs/doc.py:534
|
646 |
+
│ └─ 20.701 KGWikiMedia.perform_entity_linking textgraphs/kg.py:306
|
647 |
+
│ ├─ 10.790 KGWikiMedia._link_kg_search_entities textgraphs/kg.py:932
|
648 |
+
│ │ └─ 10.787 KGWikiMedia.dbpedia_search_entity textgraphs/kg.py:641
|
649 |
+
│ │ └─ 10.711 get requests/api.py:62
|
650 |
+
│ │ [37 frames hidden] requests, urllib3, http, socket, ssl,...
|
651 |
+
│ ├─ 9.143 KGWikiMedia._link_spotlight_entities textgraphs/kg.py:851
|
652 |
+
│ │ └─ 9.140 KGWikiMedia.dbpedia_search_entity textgraphs/kg.py:641
|
653 |
+
│ │ └─ 9.095 get requests/api.py:62
|
654 |
+
│ │ [37 frames hidden] requests, urllib3, http, socket, ssl,...
|
655 |
+
│ └─ 0.768 KGWikiMedia._secondary_entity_linking textgraphs/kg.py:1060
|
656 |
+
│ └─ 0.768 KGWikiMedia.wikidata_search textgraphs/kg.py:575
|
657 |
+
│ └─ 0.765 KGWikiMedia._wikidata_endpoint textgraphs/kg.py:444
|
658 |
+
│ └─ 0.765 get requests/api.py:62
|
659 |
+
│ [7 frames hidden] requests, urllib3
|
660 |
+
└─ 19.514 <module> ../ipykernel_5151/1708547378.py:1
|
661 |
+
├─ 14.502 InferRel_Rebel.__init__ textgraphs/rel.py:121
|
662 |
+
│ └─ 14.338 pipeline transformers/pipelines/__init__.py:531
|
663 |
+
│ [39 frames hidden] transformers, torch, <built-in>, json
|
664 |
+
├─ 3.437 PipelineFactory.__init__ textgraphs/pipe.py:434
|
665 |
+
│ └─ 3.420 load spacy/__init__.py:27
|
666 |
+
│ [20 frames hidden] spacy, en_core_web_sm, catalogue, imp...
|
667 |
+
├─ 0.900 InferRel_OpenNRE.__init__ textgraphs/rel.py:33
|
668 |
+
│ └─ 0.888 get_model opennre/pretrain.py:126
|
669 |
+
└─ 0.672 TextGraphs.create_pipeline textgraphs/doc.py:103
|
670 |
+
└─ 0.672 PipelineFactory.create_pipeline textgraphs/pipe.py:508
|
671 |
+
└─ 0.672 Pipeline.__init__ textgraphs/pipe.py:216
|
672 |
+
└─ 0.672 English.__call__ spacy/language.py:1016
|
673 |
+
[11 frames hidden] spacy, spacy_dbpedia_spotlight, reque...
|
674 |
+
14.363 InferRel_Rebel.gen_triples_async textgraphs/pipe.py:188
|
675 |
+
├─ 13.670 InferRel_Rebel.gen_triples textgraphs/rel.py:259
|
676 |
+
│ ├─ 12.439 InferRel_Rebel.tokenize_sent textgraphs/rel.py:145
|
677 |
+
│ │ └─ 12.436 TranslationPipeline.__call__ transformers/pipelines/text2text_generation.py:341
|
678 |
+
│ │ [42 frames hidden] transformers, torch, <built-in>
|
679 |
+
│ └─ 1.231 KGWikiMedia.resolve_rel_iri textgraphs/kg.py:370
|
680 |
+
│ └─ 0.753 get_entity_dict_from_api qwikidata/linked_data_interface.py:21
|
681 |
+
│ [8 frames hidden] qwikidata, requests, urllib3
|
682 |
+
└─ 0.693 InferRel_OpenNRE.gen_triples textgraphs/rel.py:58
|
683 |
+
|
684 |
+
|
685 |
+
|
686 |
+
|
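Beyond the console tree above, the same profiling session can also be exported as an interactive HTML report. A minimal sketch, assuming pyinstrument 4.x; the output file name is arbitrary:

```python
# write the recorded profile to a standalone HTML file
with open("profile_ex0_0.html", "w", encoding="utf-8") as fp:
    fp.write(profiler.output_html())
```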
687 |
+
## outro
|
688 |
+
|
689 |
+
_\[ more parts are in progress, getting added to this demo \]_
|
docs/ex0_0_files/ex0_0_17_0.svg
ADDED
|
docs/ex0_0_files/ex0_0_37_0.jpg
ADDED
|
docs/ex0_0_files/ex0_0_37_0.png
ADDED
|
docs/ex0_0_files/ex0_0_39_0.jpg
ADDED
|
docs/ex0_0_files/ex0_0_39_0.png
ADDED
|
docs/ex0_0_files/ex0_0_40_0.png
ADDED
|
docs/ex0_0_files/ex0_0_42_0.png
ADDED
|
docs/ex0_0_files/tmp.fig01.png
ADDED
|
docs/ex0_0_files/tmp.fig02.png
ADDED
|
docs/ex1_0.md
ADDED
@@ -0,0 +1,776 @@
|
1 |
+
|
2 |
+
|
3 |
+
!!! note
|
4 |
+
To run this notebook in JupyterLab, load [`examples/ex1_0.ipynb`](https://github.com/DerwenAI/textgraphs/blob/main/examples/ex1_0.ipynb)
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
# reproduce results from the "InGram" paper
|
9 |
+
|
10 |
+
This is an attempt to reproduce the _graph of relations_ example given in `lee2023ingram`.
|
11 |
+
|
12 |
+
## environment
|
13 |
+
|
14 |
+
|
15 |
+
```python
|
16 |
+
import os
|
17 |
+
import pathlib
|
18 |
+
import typing
|
19 |
+
|
20 |
+
from icecream import ic
|
21 |
+
from pyinstrument import Profiler
|
22 |
+
import matplotlib.pyplot as plt
|
23 |
+
import pandas as pd
|
24 |
+
import pyvis
|
25 |
+
|
26 |
+
import textgraphs
|
27 |
+
```
|
28 |
+
|
29 |
+
|
30 |
+
```python
|
31 |
+
%load_ext watermark
|
32 |
+
```
|
33 |
+
|
34 |
+
|
35 |
+
```python
|
36 |
+
%watermark
|
37 |
+
```
|
38 |
+
|
39 |
+
Last updated: 2024-01-16T17:35:45.550539-08:00
|
40 |
+
|
41 |
+
Python implementation: CPython
|
42 |
+
Python version : 3.10.11
|
43 |
+
IPython version : 8.20.0
|
44 |
+
|
45 |
+
Compiler : Clang 13.0.0 (clang-1300.0.29.30)
|
46 |
+
OS : Darwin
|
47 |
+
Release : 21.6.0
|
48 |
+
Machine : x86_64
|
49 |
+
Processor : i386
|
50 |
+
CPU cores : 8
|
51 |
+
Architecture: 64bit
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
```python
|
57 |
+
%watermark --iversions
|
58 |
+
```
|
59 |
+
|
60 |
+
matplotlib: 3.8.2
|
61 |
+
pandas : 2.1.4
|
62 |
+
pyvis : 0.3.2
|
63 |
+
textgraphs: 0.5.0
|
64 |
+
sys : 3.10.11 (v3.10.11:7d4cc5aa85, Apr 4 2023, 19:05:19) [Clang 13.0.0 (clang-1300.0.29.30)]
|
65 |
+
|
66 |
+
|
67 |
+
|
68 |
+
## load example graph
|
69 |
+
|
70 |
+
load from a JSON file which replicates the data for the "Figure 3" example
|
71 |
+
|
72 |
+
|
73 |
+
```python
|
74 |
+
graph: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(
|
75 |
+
textgraphs.SimpleGraph()
|
76 |
+
)
|
77 |
+
|
78 |
+
ingram_path: pathlib.Path = pathlib.Path(os.getcwd()) / "ingram.json"
|
79 |
+
|
80 |
+
graph.load_ingram(
|
81 |
+
ingram_path,
|
82 |
+
debug = False,
|
83 |
+
)
|
84 |
+
```
|
85 |
+
|
86 |
+
set up the statistical stack profiling
|
87 |
+
|
88 |
+
|
89 |
+
```python
|
90 |
+
profiler: Profiler = Profiler()
|
91 |
+
profiler.start()
|
92 |
+
```
|
93 |
+
|
94 |
+
## decouple graph edges into "seeds"
|
95 |
+
|
96 |
+
|
97 |
+
```python
|
98 |
+
graph.seeds(
|
99 |
+
debug = True,
|
100 |
+
)
|
101 |
+
```
|
102 |
+
|
103 |
+
|
104 |
+
--- triples in source graph ---
|
105 |
+
|
106 |
+
|
107 |
+
ic| edge.src_node: 0, rel_id: 1, edge.dst_node: 1
|
108 |
+
ic| edge.src_node: 0, rel_id: 0, edge.dst_node: 2
|
109 |
+
ic| edge.src_node: 0, rel_id: 0, edge.dst_node: 3
|
110 |
+
ic| edge.src_node: 4, rel_id: 2, edge.dst_node: 2
|
111 |
+
ic| edge.src_node: 4, rel_id: 2, edge.dst_node: 3
|
112 |
+
ic| edge.src_node: 4, rel_id: 1, edge.dst_node: 5
|
113 |
+
ic| edge.src_node: 6, rel_id: 1, edge.dst_node: 5
|
114 |
+
ic| edge.src_node: 6, rel_id: 2, edge.dst_node: 7
|
115 |
+
ic| edge.src_node: 6, rel_id: 4, edge.dst_node: 8
|
116 |
+
ic| edge.src_node: 9, rel_id: 5, edge.dst_node: 10
ic| edge.src_node: 9, rel_id: 4, edge.dst_node: 10
ic| edge.src_node: 9, rel_id: 3, edge.dst_node: 8
ic| edge.src_node: 11, rel_id: 4, edge.dst_node: 12
ic| edge.src_node: 11, rel_id: 3, edge.dst_node: 12
ic| edge.src_node: 11, rel_id: 3, edge.dst_node: 8

Steven_Spielberg Profession Director
Steven_Spielberg Directed Catch_Me_If_Can
Steven_Spielberg Directed Saving_Private_Ryan
Tom_Hanks ActedIn Catch_Me_If_Can
Tom_Hanks ActedIn Saving_Private_Ryan
Tom_Hanks Profession Actor
Mark_Hamil Profession Actor
Mark_Hamil ActedIn Star_Wars
Mark_Hamil BornIn California
Brad_Pitt Nationality USA
Brad_Pitt BornIn USA
Brad_Pitt LivedIn California
Clint_Eastwood BornIn San_Francisco
Clint_Eastwood LivedIn San_Francisco
Clint_Eastwood LivedIn California
|
143 |
+
|
144 |
+
|
145 |
+
|
146 |
+
```python
|
147 |
+
graph.trace_source_graph()
|
148 |
+
```
|
149 |
+
|
150 |
+
|
151 |
+
--- nodes in source graph ---
|
152 |
+
n: 0, Steven_Spielberg
|
153 |
+
head: []
|
154 |
+
tail: [(0, 'Profession', 1), (0, 'Directed', 2), (0, 'Directed', 3)]
|
155 |
+
n: 1, Director
|
156 |
+
head: [(0, 'Profession', 1)]
|
157 |
+
tail: []
|
158 |
+
n: 2, Catch_Me_If_Can
|
159 |
+
head: [(0, 'Directed', 2), (4, 'ActedIn', 2)]
|
160 |
+
tail: []
|
161 |
+
n: 3, Saving_Private_Ryan
|
162 |
+
head: [(0, 'Directed', 3), (4, 'ActedIn', 3)]
|
163 |
+
tail: []
|
164 |
+
n: 4, Tom_Hanks
|
165 |
+
head: []
|
166 |
+
tail: [(4, 'ActedIn', 2), (4, 'ActedIn', 3), (4, 'Profession', 5)]
|
167 |
+
n: 5, Actor
|
168 |
+
head: [(4, 'Profession', 5), (6, 'Profession', 5)]
|
169 |
+
tail: []
|
170 |
+
n: 6, Mark_Hamil
|
171 |
+
head: []
|
172 |
+
tail: [(6, 'Profession', 5), (6, 'ActedIn', 7), (6, 'BornIn', 8)]
|
173 |
+
n: 7, Star_Wars
|
174 |
+
head: [(6, 'ActedIn', 7)]
|
175 |
+
tail: []
|
176 |
+
n: 8, California
|
177 |
+
head: [(6, 'BornIn', 8), (9, 'LivedIn', 8), (11, 'LivedIn', 8)]
|
178 |
+
tail: []
|
179 |
+
n: 9, Brad_Pitt
|
180 |
+
head: []
|
181 |
+
tail: [(9, 'Nationality', 10), (9, 'BornIn', 10), (9, 'LivedIn', 8)]
|
182 |
+
n: 10, USA
|
183 |
+
head: [(9, 'Nationality', 10), (9, 'BornIn', 10)]
|
184 |
+
tail: []
|
185 |
+
n: 11, Clint_Eastwood
|
186 |
+
head: []
|
187 |
+
tail: [(11, 'BornIn', 12), (11, 'LivedIn', 12), (11, 'LivedIn', 8)]
|
188 |
+
n: 12, San_Francisco
|
189 |
+
head: [(11, 'BornIn', 12), (11, 'LivedIn', 12)]
|
190 |
+
tail: []
|
191 |
+
|
192 |
+
--- edges in source graph ---
|
193 |
+
e: 0, Directed
|
194 |
+
e: 1, Profession
|
195 |
+
e: 2, ActedIn
|
196 |
+
e: 3, LivedIn
|
197 |
+
e: 4, BornIn
|
198 |
+
e: 5, Nationality
|
199 |
+
|
200 |
+
|
201 |
+
## construct a _graph of relations_
|
202 |
+
|
203 |
+
Transform the graph data into a _graph of relations_:
|
204 |
+
|
205 |
+
|
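For intuition about the transform (independent of the library internals): every entity node induces one arc in the _graph of relations_ for each pair of relations incident on it, as the debug trace below shows. A toy sketch of that idea, using a few hypothetical triples taken from the listing above:

```python
from itertools import combinations

# three of the source triples, hard-coded for illustration
toy_triples = [
    ("Steven_Spielberg", "Profession", "Director"),
    ("Steven_Spielberg", "Directed", "Catch_Me_If_Can"),
    ("Tom_Hanks", "ActedIn", "Catch_Me_If_Can"),
]

entities = {s for s, _, _ in toy_triples} | {o for _, _, o in toy_triples}
rel_pairs = set()

for entity in entities:
    # relations incident on this entity, whether as head or tail
    incident = {rel for subj, rel, obj in toy_triples if entity in (subj, obj)}
    rel_pairs.update(combinations(sorted(incident), 2))

print(sorted(rel_pairs))
# [('ActedIn', 'Directed'), ('Directed', 'Profession')]
```

The library's `TransArc` records additionally track whether each relation touches the shared entity as its head or tail, which this sketch ignores.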
206 |
+
```python
|
207 |
+
graph.construct_gor(
|
208 |
+
debug = True,
|
209 |
+
)
|
210 |
+
```
|
211 |
+
|
212 |
+
--- transformed triples ---

ic| node_id: 0, len(seeds): 3
ic| trans_arc: TransArc(pair_key=(0, 1), a_rel=1, b_rel=0, node_id=0, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(0, 1) Profession.tail Steven_Spielberg Directed.tail
ic| trans_arc: TransArc(pair_key=(0, 1), a_rel=1, b_rel=0, node_id=0, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(0, 1) Profession.tail Steven_Spielberg Directed.tail
ic| trans_arc: TransArc(pair_key=(0, 0), a_rel=0, b_rel=0, node_id=0, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(0, 0) Directed.tail Steven_Spielberg Directed.tail
ic| node_id: 1, len(seeds): 1
ic| node_id: 2, len(seeds): 2
ic| trans_arc: TransArc(pair_key=(0, 2), a_rel=0, b_rel=2, node_id=2, a_dir=<RelDir.HEAD: 0>, b_dir=<RelDir.HEAD: 0>)
(0, 2) Directed.head Catch_Me_If_Can ActedIn.head
ic| node_id: 3, len(seeds): 2
ic| trans_arc: TransArc(pair_key=(0, 2), a_rel=0, b_rel=2, node_id=3, a_dir=<RelDir.HEAD: 0>, b_dir=<RelDir.HEAD: 0>)
(0, 2) Directed.head Saving_Private_Ryan ActedIn.head
ic| node_id: 4, len(seeds): 3
ic| trans_arc: TransArc(pair_key=(2, 2), a_rel=2, b_rel=2, node_id=4, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(2, 2) ActedIn.tail Tom_Hanks ActedIn.tail
ic| trans_arc: TransArc(pair_key=(1, 2), a_rel=2, b_rel=1, node_id=4, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(1, 2) ActedIn.tail Tom_Hanks Profession.tail
ic| trans_arc: TransArc(pair_key=(1, 2), a_rel=2, b_rel=1, node_id=4, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(1, 2) ActedIn.tail Tom_Hanks Profession.tail
ic| node_id: 5, len(seeds): 2
ic| trans_arc: TransArc(pair_key=(1, 1), a_rel=1, b_rel=1, node_id=5, a_dir=<RelDir.HEAD: 0>, b_dir=<RelDir.HEAD: 0>)
(1, 1) Profession.head Actor Profession.head
ic| node_id: 6, len(seeds): 3
ic| trans_arc: TransArc(pair_key=(1, 2), a_rel=1, b_rel=2, node_id=6, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(1, 2) Profession.tail Mark_Hamil ActedIn.tail
ic| trans_arc: TransArc(pair_key=(1, 4), a_rel=1, b_rel=4, node_id=6, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(1, 4) Profession.tail Mark_Hamil BornIn.tail
ic| trans_arc: TransArc(pair_key=(2, 4), a_rel=2, b_rel=4, node_id=6, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(2, 4) ActedIn.tail Mark_Hamil BornIn.tail
ic| node_id: 7, len(seeds): 1
ic| node_id: 8, len(seeds): 3
ic| trans_arc: TransArc(pair_key=(3, 4), a_rel=4, b_rel=3, node_id=8, a_dir=<RelDir.HEAD: 0>, b_dir=<RelDir.HEAD: 0>)
(3, 4) BornIn.head California LivedIn.head
ic| trans_arc: TransArc(pair_key=(3, 4), a_rel=4, b_rel=3, node_id=8, a_dir=<RelDir.HEAD: 0>, b_dir=<RelDir.HEAD: 0>)
(3, 4) BornIn.head California LivedIn.head
ic| trans_arc: TransArc(pair_key=(3, 3), a_rel=3, b_rel=3, node_id=8, a_dir=<RelDir.HEAD: 0>, b_dir=<RelDir.HEAD: 0>)
(3, 3) LivedIn.head California LivedIn.head
ic| node_id: 9, len(seeds): 3
ic| trans_arc: TransArc(pair_key=(4, 5), a_rel=5, b_rel=4, node_id=9, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(4, 5) Nationality.tail Brad_Pitt BornIn.tail
ic| trans_arc: TransArc(pair_key=(3, 5), a_rel=5, b_rel=3, node_id=9, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(3, 5) Nationality.tail Brad_Pitt LivedIn.tail
ic| trans_arc: TransArc(pair_key=(3, 4), a_rel=4, b_rel=3, node_id=9, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(3, 4) BornIn.tail Brad_Pitt LivedIn.tail
ic| node_id: 10, len(seeds): 2
ic| trans_arc: TransArc(pair_key=(4, 5), a_rel=5, b_rel=4, node_id=10, a_dir=<RelDir.HEAD: 0>, b_dir=<RelDir.HEAD: 0>)
(4, 5) Nationality.head USA BornIn.head
ic| node_id: 11, len(seeds): 3
ic| trans_arc: TransArc(pair_key=(3, 4), a_rel=4, b_rel=3, node_id=11, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(3, 4) BornIn.tail Clint_Eastwood LivedIn.tail
ic| trans_arc: TransArc(pair_key=(3, 4), a_rel=4, b_rel=3, node_id=11, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(3, 4) BornIn.tail Clint_Eastwood LivedIn.tail
ic| trans_arc: TransArc(pair_key=(3, 3), a_rel=3, b_rel=3, node_id=11, a_dir=<RelDir.TAIL: 1>, b_dir=<RelDir.TAIL: 1>)
(3, 3) LivedIn.tail Clint_Eastwood LivedIn.tail
ic| node_id: 12, len(seeds): 2
ic| trans_arc: TransArc(pair_key=(3, 4), a_rel=4, b_rel=3, node_id=12, a_dir=<RelDir.HEAD: 0>, b_dir=<RelDir.HEAD: 0>)
(3, 4) BornIn.head San_Francisco LivedIn.head
|
480 |
+
|
481 |
+
|
482 |
+
|
483 |
+
|
484 |
+
```python
|
485 |
+
scores: typing.Dict[ tuple, float ] = graph.get_affinity_scores(
|
486 |
+
debug = True,
|
487 |
+
)
|
488 |
+
```
|
489 |
+
|
490 |
+
|
491 |
+
--- collect shared entity tallies ---
|
492 |
+
0 Directed
|
493 |
+
h: 4 dict_items([(2, 4.0)])
|
494 |
+
t: 6 dict_items([(0, 3.0), (1, 3.0)])
|
495 |
+
1 Profession
|
496 |
+
h: 3 dict_items([(1, 3.0)])
|
497 |
+
t: 10 dict_items([(0, 3.0), (2, 5.0), (4, 2.0)])
|
498 |
+
2 ActedIn
|
499 |
+
h: 4 dict_items([(0, 4.0)])
|
500 |
+
t: 10 dict_items([(1, 5.0), (2, 3.0), (4, 2.0)])
|
501 |
+
3 LivedIn
|
502 |
+
h: 8 dict_items([(3, 3.0), (4, 5.0)])
|
503 |
+
t: 10 dict_items([(3, 3.0), (4, 5.0), (5, 2.0)])
|
504 |
+
4 BornIn
|
505 |
+
h: 7 dict_items([(3, 5.0), (5, 2.0)])
|
506 |
+
t: 11 dict_items([(1, 2.0), (2, 2.0), (3, 5.0), (5, 2.0)])
|
507 |
+
5 Nationality
|
508 |
+
h: 2 dict_items([(4, 2.0)])
|
509 |
+
t: 4 dict_items([(3, 2.0), (4, 2.0)])
|
510 |
+
|
511 |
+
|
512 |
+
|
513 |
+
```python
|
514 |
+
ic(scores);
|
515 |
+
```
|
516 |
+
|
517 |
+
ic| scores: {(0, 0): 0.3,
|
518 |
+
(0, 1): 0.2653846153846154,
|
519 |
+
(0, 2): 0.34285714285714286,
|
520 |
+
(1, 1): 0.23076923076923078,
|
521 |
+
(1, 2): 0.3708791208791209,
|
522 |
+
(1, 4): 0.13247863247863248,
|
523 |
+
(2, 2): 0.21428571428571427,
|
524 |
+
(2, 4): 0.12698412698412698,
|
525 |
+
(3, 3): 0.3333333333333333,
|
526 |
+
(3, 4): 0.5555555555555556,
|
527 |
+
(3, 5): 0.2222222222222222,
|
528 |
+
(4, 5): 0.4444444444444444}
|
529 |
+
|
530 |
+
|
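As a quick check on that dict, here is a minimal sketch (not part of the original notebook) picking out the highest-affinity pair; the integer ids follow the edge listing above, e.g. 3 = LivedIn, 4 = BornIn:

```python
# find the relation pair with the strongest affinity
best_pair, best_score = max(scores.items(), key=lambda kv: kv[1])
print(best_pair, round(best_score, 3))
# (3, 4) 0.556  i.e. LivedIn ~ BornIn
```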
531 |
+
## visualize the transform results
|
532 |
+
|
533 |
+
|
534 |
+
```python
|
535 |
+
graph.render_gor_plt(scores)
|
536 |
+
plt.show()
|
537 |
+
```
|
538 |
+
|
539 |
+
|
540 |
+
|
541 |
+

|
542 |
+
|
543 |
+
|
544 |
+
|
545 |
+
|
546 |
+
```python
|
547 |
+
pv_graph: pyvis.network.Network = graph.render_gor_pyvis(scores)
|
548 |
+
|
549 |
+
pv_graph.force_atlas_2based(
|
550 |
+
gravity = -38,
|
551 |
+
central_gravity = 0.01,
|
552 |
+
spring_length = 231,
|
553 |
+
spring_strength = 0.7,
|
554 |
+
damping = 0.8,
|
555 |
+
overlap = 0,
|
556 |
+
)
|
557 |
+
|
558 |
+
pv_graph.show_buttons(filter_ = [ "physics" ])
|
559 |
+
pv_graph.toggle_physics(True)
|
560 |
+
|
561 |
+
pv_graph.prep_notebook()
|
562 |
+
pv_graph.show("tmp.fig03.html")
|
563 |
+
```
|
564 |
+
|
565 |
+
tmp.fig03.html
|
566 |
+
|
567 |
+
|
568 |
+
|
569 |
+
|
570 |
+
|
571 |
+
|
572 |
+

|
573 |
+
|
574 |
+
|
575 |
+
|
576 |
+
|
577 |
+
## analysis
|
578 |
+
|
579 |
+
As the results below illustrate, the computed _affinity scores_ differ from those published in `lee2023ingram`. After trying several different interpretations of the paper's descriptions, the current approach provides the closest approximation we have obtained.
|
580 |
+
|
581 |
+
|
582 |
+
```python
|
583 |
+
df: pd.DataFrame = graph.trace_metrics(scores)
|
584 |
+
df
|
585 |
+
```
|
586 |
+
|
587 |
+
|
588 |
+
|
589 |
+
|
590 |
+
<div>
|
591 |
+
<style scoped>
|
592 |
+
.dataframe tbody tr th:only-of-type {
|
593 |
+
vertical-align: middle;
|
594 |
+
}
|
595 |
+
|
596 |
+
.dataframe tbody tr th {
|
597 |
+
vertical-align: top;
|
598 |
+
}
|
599 |
+
|
600 |
+
.dataframe thead th {
|
601 |
+
text-align: right;
|
602 |
+
}
|
603 |
+
</style>
|
604 |
+
<table border="1" class="dataframe">
|
605 |
+
<thead>
|
606 |
+
<tr style="text-align: right;">
|
607 |
+
<th></th>
|
608 |
+
<th>pair</th>
|
609 |
+
<th>rel_a</th>
|
610 |
+
<th>rel_b</th>
|
611 |
+
<th>affinity</th>
|
612 |
+
<th>expected</th>
|
613 |
+
</tr>
|
614 |
+
</thead>
|
615 |
+
<tbody>
|
616 |
+
<tr>
|
617 |
+
<th>0</th>
|
618 |
+
<td>(0, 0)</td>
|
619 |
+
<td>Directed</td>
|
620 |
+
<td>Directed</td>
|
621 |
+
<td>0.30</td>
|
622 |
+
<td>NaN</td>
|
623 |
+
</tr>
|
624 |
+
<tr>
|
625 |
+
<th>1</th>
|
626 |
+
<td>(0, 1)</td>
|
627 |
+
<td>Directed</td>
|
628 |
+
<td>Profession</td>
|
629 |
+
<td>0.27</td>
|
630 |
+
<td>0.22</td>
|
631 |
+
</tr>
|
632 |
+
<tr>
|
633 |
+
<th>2</th>
|
634 |
+
<td>(0, 2)</td>
|
635 |
+
<td>Directed</td>
|
636 |
+
<td>ActedIn</td>
|
637 |
+
<td>0.34</td>
|
638 |
+
<td>0.50</td>
|
639 |
+
</tr>
|
640 |
+
<tr>
|
641 |
+
<th>3</th>
|
642 |
+
<td>(1, 1)</td>
|
643 |
+
<td>Profession</td>
|
644 |
+
<td>Profession</td>
|
645 |
+
<td>0.23</td>
|
646 |
+
<td>NaN</td>
|
647 |
+
</tr>
|
648 |
+
<tr>
|
649 |
+
<th>4</th>
|
650 |
+
<td>(1, 2)</td>
|
651 |
+
<td>Profession</td>
|
652 |
+
<td>ActedIn</td>
|
653 |
+
<td>0.37</td>
|
654 |
+
<td>0.33</td>
|
655 |
+
</tr>
|
656 |
+
<tr>
|
657 |
+
<th>5</th>
|
658 |
+
<td>(1, 4)</td>
|
659 |
+
<td>Profession</td>
|
660 |
+
<td>BornIn</td>
|
661 |
+
<td>0.13</td>
|
662 |
+
<td>0.11</td>
|
663 |
+
</tr>
|
664 |
+
<tr>
|
665 |
+
<th>6</th>
|
666 |
+
<td>(2, 2)</td>
|
667 |
+
<td>ActedIn</td>
|
668 |
+
<td>ActedIn</td>
|
669 |
+
<td>0.21</td>
|
670 |
+
<td>NaN</td>
|
671 |
+
</tr>
|
672 |
+
<tr>
|
673 |
+
<th>7</th>
|
674 |
+
<td>(2, 4)</td>
|
675 |
+
<td>ActedIn</td>
|
676 |
+
<td>BornIn</td>
|
677 |
+
<td>0.13</td>
|
678 |
+
<td>0.11</td>
|
679 |
+
</tr>
|
680 |
+
<tr>
|
681 |
+
<th>8</th>
|
682 |
+
<td>(3, 3)</td>
|
683 |
+
<td>LivedIn</td>
|
684 |
+
<td>LivedIn</td>
|
685 |
+
<td>0.33</td>
|
686 |
+
<td>NaN</td>
|
687 |
+
</tr>
|
688 |
+
<tr>
|
689 |
+
<th>9</th>
|
690 |
+
<td>(3, 4)</td>
|
691 |
+
<td>LivedIn</td>
|
692 |
+
<td>BornIn</td>
|
693 |
+
<td>0.56</td>
|
694 |
+
<td>0.81</td>
|
695 |
+
</tr>
|
696 |
+
<tr>
|
697 |
+
<th>10</th>
|
698 |
+
<td>(3, 5)</td>
|
699 |
+
<td>LivedIn</td>
|
700 |
+
<td>Nationality</td>
|
701 |
+
<td>0.22</td>
|
702 |
+
<td>0.11</td>
|
703 |
+
</tr>
|
704 |
+
<tr>
|
705 |
+
<th>11</th>
|
706 |
+
<td>(4, 5)</td>
|
707 |
+
<td>BornIn</td>
|
708 |
+
<td>Nationality</td>
|
709 |
+
<td>0.44</td>
|
710 |
+
<td>0.36</td>
|
711 |
+
</tr>
|
712 |
+
</tbody>
|
713 |
+
</table>
|
714 |
+
</div>
|
715 |
+
|
716 |
+
|
717 |
+
|
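To quantify the gap described above, a minimal sketch comparing the computed values against the paper's published ones, assuming `df` holds the `affinity` and `expected` columns shown in the table (the self-pairs with `NaN` expectations are dropped):

```python
# compare computed affinities against the published expectations
cmp_df = df.dropna(subset=["expected"])

mae = (cmp_df["affinity"] - cmp_df["expected"]).abs().mean()
corr = cmp_df["affinity"].corr(cmp_df["expected"])

print(f"mean absolute difference: {mae:.3f}")
print(f"Pearson correlation: {corr:.3f}")
```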
718 |
+
## statistical stack profile instrumentation
|
719 |
+
|
720 |
+
|
721 |
+
```python
|
722 |
+
profiler.stop()
|
723 |
+
```
|
724 |
+
|
725 |
+
|
726 |
+
|
727 |
+
|
728 |
+
<pyinstrument.session.Session at 0x1416bc7f0>
|
729 |
+
|
730 |
+
|
731 |
+
|
732 |
+
|
733 |
+
```python
|
734 |
+
profiler.print()
|
735 |
+
```
|
736 |
+
|
737 |
+
|
738 |
+
_ ._ __/__ _ _ _ _ _/_ Recorded: 17:35:45 Samples: 2526
|
739 |
+
/_//_/// /_\ / //_// / //_'/ // Duration: 3.799 CPU time: 4.060
|
740 |
+
/ _/ v4.6.1
|
741 |
+
|
742 |
+
Program: /Users/paco/src/textgraphs/venv/lib/python3.10/site-packages/ipykernel_launcher.py -f /Users/paco/Library/Jupyter/runtime/kernel-27f0c564-73f8-45ab-9f64-8b064ae1de10.json
|
743 |
+
|
744 |
+
3.799 IPythonKernel.dispatch_queue ipykernel/kernelbase.py:525
|
745 |
+
└─ 3.791 IPythonKernel.process_one ipykernel/kernelbase.py:511
|
746 |
+
[10 frames hidden] ipykernel, IPython
|
747 |
+
3.680 ZMQInteractiveShell.run_ast_nodes IPython/core/interactiveshell.py:3394
|
748 |
+
├─ 2.176 <module> ../ipykernel_4421/3358887201.py:1
|
749 |
+
│ └─ 2.176 GraphOfRelations.construct_gor textgraphs/gor.py:311
|
750 |
+
│ ├─ 1.607 IceCreamDebugger.__call__ icecream/icecream.py:204
|
751 |
+
│ │ [17 frames hidden] icecream, colorama, ipykernel, thread...
|
752 |
+
│ │ 1.078 lock.acquire <built-in>
|
753 |
+
│ └─ 0.566 GraphOfRelations._transformed_triples textgraphs/gor.py:275
|
754 |
+
│ └─ 0.563 IceCreamDebugger.__call__ icecream/icecream.py:204
|
755 |
+
│ [13 frames hidden] icecream, colorama, ipykernel, zmq, t...
|
756 |
+
├─ 0.866 <module> ../ipykernel_4421/4061275008.py:1
|
757 |
+
│ └─ 0.866 GraphOfRelations.seeds textgraphs/gor.py:197
|
758 |
+
│ └─ 0.865 IceCreamDebugger.__call__ icecream/icecream.py:204
|
759 |
+
│ [42 frames hidden] icecream, inspect, posixpath, <built-...
|
760 |
+
├─ 0.362 <module> ../ipykernel_4421/559531165.py:1
|
761 |
+
│ ├─ 0.234 show matplotlib/pyplot.py:482
|
762 |
+
│ │ [32 frames hidden] matplotlib, matplotlib_inline, IPytho...
|
763 |
+
│ └─ 0.128 GraphOfRelations.render_gor_plt textgraphs/gor.py:522
|
764 |
+
│ └─ 0.104 draw_networkx networkx/drawing/nx_pylab.py:127
|
765 |
+
│ [6 frames hidden] networkx, matplotlib
|
766 |
+
├─ 0.197 <module> ../ipykernel_4421/1169542473.py:1
|
767 |
+
│ └─ 0.197 IceCreamDebugger.__call__ icecream/icecream.py:204
|
768 |
+
│ [14 frames hidden] icecream, colorama, ipykernel, thread...
|
769 |
+
└─ 0.041 <module> ../ipykernel_4421/2247466716.py:1
|
770 |
+
|
771 |
+
|
772 |
+
|
773 |
+
|
774 |
+
## outro
|
775 |
+
|
776 |
+
_\[ more parts are in progress, getting added to this demo \]_
|
docs/ex1_0_files/ex1_0_22_0.png
ADDED
|