diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..feaf570b8323de75398ed7a6eab69be96cd298e5 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,37 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +*.graffle filter=lfs diff=lfs merge=lfs -text +docs/assets/textgraphs.graffle filter=lfs diff=lfs merge=lfs -text diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 
0000000000000000000000000000000000000000..ff5aebe20aae97ecd7ec498d3793bcfcbb297b33 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1 @@ +github: ceteri diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000000000000000000000000000000000..85006f328bf2e933bbd5376e87a3ac54c2ef0412 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,9 @@ +# Please see the documentation for all configuration options: +# https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..dfecbe31352b9b9dfd1da134998c1278a6996c1f --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,38 @@ +name: CI + +on: [pull_request, workflow_dispatch] + +jobs: +# pre-commit: +# name: Run pre-commit +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v3 +# - uses: actions/setup-python@v3 +# - uses: pre-commit/action@v3.0.0 + + test: + name: Tests for Python ${{ matrix.python-version }} + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.10'] + fail-fast: false +# needs: pre-commit + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip install -e . 
+ pip install -r requirements-dev.txt + + - name: Run tests + run: | + pytest diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..fb05fdbed26fbcd57d39e83e5f60ea990573ca7e --- /dev/null +++ b/.gitignore @@ -0,0 +1,173 @@ +# local files +*~ +chromedriver +lemma.json +lemma.ttl +lemma.zip +lemma_graph.zip +examples/tmp.*.html +vis.html +gor.html +txg.tgz +s2v_old/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..749eea3a13bc7e27b7df9e7558157c00fd5506b4 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,37 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +default_stages: [commit, push] +default_language_version: + python: python3 +exclude: "deprecated" +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: trailing-whitespace + exclude: ^docs/ + - id: check-builtin-literals + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-json + - id: check-yaml + - id: debug-statements + - id: detect-private-key +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.4.1 + hooks: + - id: mypy # type annotations + exclude: ^tests/,^venv/ +- repo: https://github.com/PyCQA/pylint + rev: v2.17.4 + hooks: + - id: pylint + exclude: error.py +- repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell # spell-check source code + args: ["-L", "basf,textgraph,udo"] # comma separated stop words + exclude: ^README.md|^NOTES.md|^examples|^docs/ack.md|^docs/biblio.md + language: python + types: [text] diff --git a/CITATION b/CITATION new file mode 100644 index 0000000000000000000000000000000000000000..962f4ea7f5f87620d47e3cfe2262b98b3bd73a8b --- /dev/null +++ b/CITATION @@ -0,0 +1,8 @@ +@software{TextGraphs, + author = {Paco Nathan}, + title = {{TextGraphs + LLMs + graph ML for entity extraction, linking, ranking, and constructing a lemma graph}}, + year = 2023, + publisher = {Derwen}, + doi = {10.5281/zenodo.10431783}, + url = {https://github.com/DerwenAI/textgraphs} +} diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..de622b8049b0c97aff84e797a7489c53b424a9ae --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 
2023-2024 Derwen, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..4616dd20e854333d45ac4221a807e4cdb268e45c --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,10 @@ +include LICENSE +include README.md +include pyproject.toml +include requirements.txt +include setup.py +include tests/*.py +include textgraphs/*.py +prune .ipynb_checkpoints +prune docs +prune venv diff --git a/NOTES.md b/NOTES.md new file mode 100644 index 0000000000000000000000000000000000000000..1b30de306a51929a0c4ff59d4796b279aff05b15 --- /dev/null +++ b/NOTES.md @@ -0,0 +1,39 @@ +TODO: + + * can we build a causal graph of the provenance? + - https://www.pywhy.org/dowhy/v0.11.1/ + + * target publications: + - https://drops.dagstuhl.de/entities/issue/TGDK-volume-1-issue-1 + + * impl a _semantic random walk_ from a source KG + + * link entities for lemmas, noun chunks using MediaWiki lookups? 
+ - apply default semantics: `skos:related` + + * eval clustering/community detection for GOR? + - https://github.com/MengLiuPurdue/LocalGraphClustering + + * RAG example + - https://docs.llamaindex.ai/en/latest/examples/index_structs/knowledge_graph/KuzuGraphDemo.html#query-with-embeddings + + * extend GOR to replicate NodePiece/ULTRA ? + + * reify GOR, then use FastRP to generate embeddings? + - https://github.com/Knorreman/fastRP + + * eval community detection to condense nodes using k-medoids? + - https://medium.com/neo4j/clustering-graph-data-with-k-medoids-3b6a67ea0873 + + * add conda packaging + - https://conda.github.io/grayskull/ + + + * SPARQL the DBPedia/Wikidata equivs + + * other NER/RE: + - https://github.com/dwadden/dygiepp?tab=readme-ov-file#pretrained-models + + * check out https://github.com/wikipedia2vec/wikipedia2vec + + * link `sense2vec` synonyms; make affordances for UI to annotate synonyms diff --git a/PROMPT.md b/PROMPT.md new file mode 100644 index 0000000000000000000000000000000000000000..03c16e811975fda57d99af1d8b0f403b78d77782 --- /dev/null +++ b/PROMPT.md @@ -0,0 +1,15 @@ +https://medium.com/@nizami_muhammad/extracting-relation-from-sentence-using-llm-597d0c0310a8 + +Sentence: Werner Herzog is the son of Dietrich Herzog +Extract RDF predicate from the sentence in this format: +subject: +predicate: +object: + +--- + +Sentence: Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog. After the war, Werner fled to America to become famous. Instead he became President and decided to nuke Slovenia. 
+Be brief, extract the top RDF predicate in DBPedia for the relation between the subject and object, in this format: +subject: +predicate: +object: \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..972677e6c2ce6f5ee260097f3c31976ffd7fdaa8 --- /dev/null +++ b/README.md @@ -0,0 +1,118 @@ +--- +title: TextGraphs +emoji: ✴ +colorFrom: green +colorTo: gray +sdk: streamlit +sdk_version: 1.28.2 +app_file: app.py +pinned: false +license: mit +--- + + +# TextGraphs + +[![DOI](https://zenodo.org/badge/735568863.svg)](https://zenodo.org/doi/10.5281/zenodo.10431783) +![Licence](https://img.shields.io/github/license/DerwenAI/textgraphs) +[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) +![CI](https://github.com/DerwenAI/textgraphs/workflows/CI/badge.svg) +
+![Repo size](https://img.shields.io/github/repo-size/DerwenAI/textgraphs) +![downloads](https://img.shields.io/pypi/dm/textgraphs) +![sponsor](https://img.shields.io/github/sponsors/ceteri) + +TextGraphs logo + + +## project info + +Project home: + +Full documentation: + +Sample code is provided in `demo.py` + + +## requirements + + * Python 3.10+ + + +## deploy library from PyPi + +Prepare the virtual environment: + +```bash +python3 -m venv venv +source venv/bin/activate +python3 -m pip install -U pip wheel setuptools +``` + +Install from [PyPi](https://pypi.python.org/pypi/textgraphs): + +```bash +python3 -m pip install -U textgraphs +``` + + +## run demos locally + +```bash +python3 demo.py +``` + +```bash +streamlit run app.py +``` + + +## install library from source locally + +```bash +python3 -m venv venv +source venv/bin/activate + +python3 -m pip install -U pip wheel setuptools +python3 -m pip install -e . +``` + +To run the Streamlit or JupyterLab demos, also install: + +```bash +python3 -m pip install -r requirements-dev.txt +``` + + +## license and copyright + +Source code for **TextGraphs** plus its logo, documentation, and +examples have an [MIT license](https://spdx.org/licenses/MIT.html) +which is succinct and simplifies use in commercial applications. + +All materials herein are Copyright © 2023-2024 Derwen, Inc. 
+ + +## attribution + +Please use the following BibTeX entry for citing **TextGraphs** if you +use it in your research or software: +```bibtex +@software{TextGraphs, + author = {Paco Nathan}, + title = {{TextGraphs + LLMs + graph ML for entity extraction, linking, ranking, and constructing a lemma graph}}, + year = 2023, + publisher = {Derwen}, + doi = {10.5281/zenodo.10431783}, + url = {https://github.com/DerwenAI/textgraphs} +} +``` + + +## star history + +[![Star History Chart](https://api.star-history.com/svg?repos=derwenai/textgraphs&type=Date)](https://star-history.com/#derwenai/textgraphs&Date) diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000000000000000000000000000000000..a1c0f232d8547ce943d636c7b14ed08a69d00d31 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,14 @@ +# Security Policy + +## Supported Versions + +Versions which are currently being supported with security updates: + +| Version | Supported | +| ------- | ------------------ | +| > 0.2 | :white_check_mark: | + +## Reporting a Vulnerability + +To report a vulnerability, please create a new [*issue*](https://github.com/DerwenAI/textgraphs/issues). +We will be notified immediately, and will attempt to respond on the reported issue immediately. 
diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..3a70af227e5035d4d60bfb864a26bf32c781a38e --- /dev/null +++ b/app.py @@ -0,0 +1,459 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=C0301 + +""" +HuggingFace Spaces demo of the `TextGraphs` library using Streamlit + +see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md +""" + +import pathlib +import time +import typing + +import matplotlib.pyplot as plt # pylint: disable=E0401 +import pandas as pd # pylint: disable=E0401 +import pyvis # pylint: disable=E0401 +import spacy # pylint: disable=E0401 +import streamlit as st # pylint: disable=E0401 + +import textgraphs + + +if __name__ == "__main__": + # default text input + SRC_TEXT: str = """ +Werner Herzog is a remarkable filmmaker and intellectual originally from Germany, the son of Dietrich Herzog. + """ + + # store the initial value of widgets in session state + if "visibility" not in st.session_state: + st.session_state.visibility = "visible" + st.session_state.disabled = False + + with st.container(): + st.title("demo: TextGraphs + LLMs to construct a 'lemma graph'") + st.markdown( + """ +docs: +    +DOI: 10.5281/zenodo.10431783 + """, + unsafe_allow_html = True, + ) + + + # collect input + config + st.subheader("configure", divider = "rainbow") + + text_input: str = st.text_area( + "Source Text:", + value = SRC_TEXT.strip(), + ) + + llm_ner = st.checkbox( + "enhance spaCy NER using: SpanMarker", + value = False, + ) + + link_ents = st.checkbox( + "link entities using: DBPedia Spotlight, WikiMedia API", + value = False, + ) + + infer_rel = st.checkbox( + "infer relations using: REBEL, OpenNRE, qwikidata", + value = False, + ) + + if text_input or llm_ner or link_ents or infer_rel: + ## parse the document + st.subheader("parse the raw text", divider = "rainbow") + start_time: float = time.time() + + # generally it is fine to use factory defaults, + # although 
let's illustrate these settings here + infer_rels: list = [] + + if infer_rel: + with st.spinner(text = "load rel models..."): + infer_rels = [ + textgraphs.InferRel_OpenNRE( + model = textgraphs.OPENNRE_MODEL, + max_skip = textgraphs.MAX_SKIP, + min_prob = textgraphs.OPENNRE_MIN_PROB, + ), + textgraphs.InferRel_Rebel( + lang = "en_XX", + mrebel_model = textgraphs.MREBEL_MODEL, + ), + ] + + ner: typing.Optional[ textgraphs.Component ] = None + + if llm_ner: + ner = textgraphs.NERSpanMarker( + ner_model = textgraphs.NER_MODEL, + ) + + tg: textgraphs.TextGraphs = textgraphs.TextGraphs( + factory = textgraphs.PipelineFactory( + spacy_model = textgraphs.SPACY_MODEL, + ner = ner, + kg = textgraphs.KGWikiMedia( + spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API, + dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API, + dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API, + wikidata_api = textgraphs.WIKIDATA_API, + min_alias = textgraphs.DBPEDIA_MIN_ALIAS, + min_similarity = textgraphs.DBPEDIA_MIN_SIM, + ), + infer_rels = infer_rels, + ), + ) + + duration: float = round(time.time() - start_time, 3) + st.write(f"set up: {round(duration, 3)} sec") + + with st.spinner(text = "parse text..."): + start_time = time.time() + + pipe: textgraphs.Pipeline = tg.create_pipeline( + text_input.strip(), + ) + + duration = round(time.time() - start_time, 3) + st.write(f"parse text: {round(duration, 3)} sec, {len(text_input)} characters") + + # render the entity html + ent_html: str = spacy.displacy.render( + pipe.ner_doc, + style = "ent", + jupyter = False, + ) + + st.markdown( + ent_html, + unsafe_allow_html = True, + ) + + # generate dependencies as an SVG + dep_svg = spacy.displacy.render( + pipe.ner_doc, + style = "dep", + jupyter = False, + ) + + st.image( + dep_svg, + width = 800, + use_column_width = "never", + ) + + + ## collect graph elements from the parse + st.subheader("construct the base level of the lemma graph", divider = "rainbow") + start_time = time.time() + + 
tg.collect_graph_elements( + pipe, + debug = False, + ) + + duration = round(time.time() - start_time, 3) + st.write(f"collect elements: {round(duration, 3)} sec, {len(tg.nodes)} nodes, {len(tg.edges)} edges") + + ## perform entity linking + if link_ents: + st.subheader("extract entities and perform entity linking", divider = "rainbow") + + with st.spinner(text = "entity linking..."): + start_time = time.time() + + tg.perform_entity_linking( + pipe, + debug = False, + ) + + duration = round(time.time() - start_time, 3) + st.write(f"entity linking: {round(duration, 3)} sec") + + + ## perform relation extraction + if infer_rel: + st.subheader("infer relations", divider = "rainbow") + st.write("NB: this part runs an order of magnitude more *slooooooowly* on HF Spaces") + + with st.spinner(text = "relation extraction..."): + start_time = time.time() + + # NB: run this iteratively since Streamlit on HF Spaces is *sloooooooooow* + inferred_edges: list = tg.infer_relations( + pipe, + debug = False, + ) + + duration = round(time.time() - start_time, 3) + + n_list: list = list(tg.nodes.values()) + + df_rel: pd.DataFrame = pd.DataFrame.from_dict([ + { + "src": n_list[edge.src_node].text, + "dst": n_list[edge.dst_node].text, + "rel": edge.rel, + "weight": edge.prob, + } + for edge in inferred_edges + ]) + + st.dataframe(df_rel) + st.write(f"relation extraction: {round(duration, 3)} sec, {len(df_rel)} edges") + + + ## construct the _lemma graph_ + start_time = time.time() + + tg.construct_lemma_graph( + debug = False, + ) + + duration = round(time.time() - start_time, 3) + st.write(f"construct graph: {round(duration, 3)} sec") + + + ## rank the extracted phrases + st.subheader("rank the extracted phrases", divider = "rainbow") + start_time = time.time() + + tg.calc_phrase_ranks( + pr_alpha = textgraphs.PAGERANK_ALPHA, + debug = False, + ) + + df_ent: pd.DataFrame = tg.get_phrases_as_df() + + duration = round(time.time() - start_time, 3) + st.write(f"extract: {round(duration, 
3)} sec, {len(df_ent)} entities") + + st.dataframe(df_ent) + + + ## generate a word cloud + st.subheader("generate a word cloud", divider = "rainbow") + + render: textgraphs.RenderPyVis = tg.create_render() + wordcloud = render.generate_wordcloud() + + st.image( + wordcloud.to_image(), + width = 700, + use_column_width = "never", + ) + + + ## visualize the lemma graph + st.subheader("visualize the lemma graph", divider = "rainbow") + st.markdown( + """ + what you get at this stage is a relatively noisy, + low-level detailed graph of the parsed text + + the most interesting nodes will probably be either + subjects (`nsubj`) or direct objects (`pobj`) + """ + ) + + pv_graph: pyvis.network.Network = render.render_lemma_graph( + debug = False, + ) + + pv_graph.force_atlas_2based( + gravity = -38, + central_gravity = 0.01, + spring_length = 231, + spring_strength = 0.7, + damping = 0.8, + overlap = 0, + ) + + pv_graph.show_buttons(filter_ = [ "physics" ]) + pv_graph.toggle_physics(True) + + py_html: pathlib.Path = pathlib.Path("vis.html") + pv_graph.save_graph(py_html.as_posix()) + + st.components.v1.html( + py_html.read_text(encoding = "utf-8"), + height = render.HTML_HEIGHT_WITH_CONTROLS, + scrolling = False, + ) + + + ## cluster the communities + st.subheader("cluster the communities", divider = "rainbow") + st.markdown( + """ +
+ About this clustering... +

+In the tutorial +"How to Convert Any Text Into a Graph of Concepts", +Rahul Nayak uses the +girvan-newman +algorithm to split the graph into communities, then clusters on those communities. +His approach works well for unsupervised clustering of key phrases which have been extracted from a collection of many documents. +

+

+While Nayak was working with entities extracted from "chunks" of text, not with a text graph per se, this approach is useful for identifying network motifs which can be condensed, e.g., to extract a semantic graph overlay as an abstraction layer atop a lemma graph. +

+
+
+ """, + unsafe_allow_html = True, + ) + + spring_dist_val = st.slider( + "spring distance for NetworkX clusters", + min_value = 0.0, + max_value = 10.0, + value = 1.2, + ) + + if spring_dist_val: + start_time = time.time() + fig, ax = plt.subplots() + + comm_map: dict = render.draw_communities( + spring_distance = spring_dist_val, + ) + + st.pyplot(fig) + + duration = round(time.time() - start_time, 3) + st.write(f"cluster: {round(duration, 3)} sec, {max(comm_map.values()) + 1} clusters") + + + ## transform a graph of relations + st.subheader("transform as a graph of relations", divider = "rainbow") + st.markdown( + """ +Using the topological transform given in `lee2023ingram`, construct a +_graph of relations_ for enhancing graph inference. + +
+ What does this transform provide? +

+By using a graph of relations dual representation of our graph data, first and foremost we obtain a more compact representation of the relations in the graph, and a means of making inferences (e.g., link prediction) where there is substantially more invariance in the training data. +

+

+Also recognize that for a parse graph of a paragraph in the English language, the most interesting nodes will probably be either subjects (nsubj) or objects of prepositions (pobj). Here in the graph of relations we can see illustrated how the important details from entity linking tend to cluster near either nsubj or pobj entities, connected through punctuation. This aspect is not as readily observed in the earlier visualization of the lemma graph. +

+
+ """, + unsafe_allow_html = True, + ) + + start_time = time.time() + + gor: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(tg) + gor.seeds() + gor.construct_gor() + + scores: typing.Dict[ tuple, float ] = gor.get_affinity_scores() + pv_graph = gor.render_gor_pyvis(scores) + + pv_graph.force_atlas_2based( + gravity = -38, + central_gravity = 0.01, + spring_length = 231, + spring_strength = 0.7, + damping = 0.8, + overlap = 0, + ) + + pv_graph.show_buttons(filter_ = [ "physics" ]) + pv_graph.toggle_physics(True) + + py_html = pathlib.Path("gor.html") + pv_graph.save_graph(py_html.as_posix()) + + st.components.v1.html( + py_html.read_text(encoding = "utf-8"), + height = render.HTML_HEIGHT_WITH_CONTROLS, + scrolling = False, + ) + + duration = round(time.time() - start_time, 3) + st.write(f"transform: {round(duration, 3)} sec, {len(gor.rel_list)} relations") + + ## download lemma graph + st.subheader("download the results", divider = "rainbow") + st.markdown( + """ +Download a serialized lemma graph in multiple formats: + """, + unsafe_allow_html = True, + ) + + col1, col2, col3 = st.columns(3) + + with col1: + st.download_button( + label = "download node-link", + data = tg.dump_lemma_graph(), + file_name = "lemma_graph.json", + mime = "application/json", + ) + + st.markdown( + """ +node-link: JSON data suitable for import to Neo4j, NetworkX, etc. + """, + unsafe_allow_html = True, + ) + + with col2: + st.download_button( + label = "download RDF", + data = tg.export_rdf(), + file_name = "lemma_graph.ttl", + mime = "text/turtle", + ) + + st.markdown( + """ +Turtle/N3: W3C semantic graph representation, based on RDF, OWL, SKOS, etc. 
+ """, + unsafe_allow_html = True, + ) + + with col3: + st.download_button( + label = "download KùzuDB", + data = tg.export_kuzu(zip_name = "lemma_graph.zip"), + file_name = "lemma.zip", + mime = "application/x-zip-compressed", + ) + + st.markdown( + """ +openCypher: ZIP file of a labeled property graph in KùzuDB + """, + unsafe_allow_html = True, + ) + + + ## WIP + st.divider() + st.write("(WIP)") + + thanks: str = """ +This demo has completed, and thank you for running a Derwen space! + """ + + st.toast( + thanks, + icon ="😍", + ) diff --git a/bin/nb_md.sh b/bin/nb_md.sh new file mode 100755 index 0000000000000000000000000000000000000000..81061b9209ad9043f0564f8ffe3144cdc633a943 --- /dev/null +++ b/bin/nb_md.sh @@ -0,0 +1,15 @@ +#!/bin/bash -e -x + +for notebook_path in examples/*.ipynb; do + [ -e "$notebook_path" ] || continue + + notebook=`basename $notebook_path` + stem=`basename $notebook_path .ipynb` + + cp $notebook_path docs/$notebook + jupyter nbconvert docs/$notebook --to markdown + #exit 0 + + python3 bin/vis_doc.py docs/"$stem".md + rm docs/$notebook +done diff --git a/bin/preview.py b/bin/preview.py new file mode 100755 index 0000000000000000000000000000000000000000..31324541dc50c755902624a4abcad91dfe82c198 --- /dev/null +++ b/bin/preview.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Preview the `MkDocs` build of the online documentation. +""" + +from pathlib import PurePosixPath +import os + +from flask import Flask, redirect, send_from_directory, url_for # pylint: disable=E0401 + +DOCS_ROUTE = "/docs/" +DOCS_FILES = "../site" +DOCS_PORT = 8000 + +APP = Flask(__name__, static_folder=DOCS_FILES, template_folder=DOCS_FILES) + +APP.config["DEBUG"] = False +APP.config["MAX_CONTENT_LENGTH"] = 52428800 +APP.config["SECRET_KEY"] = "Technically, I remain uncommitted." 
+APP.config["SEND_FILE_MAX_AGE_DEFAULT"] = 3000 + + +@APP.route(DOCS_ROUTE, methods=["GET"]) +@APP.route(DOCS_ROUTE + "", methods=["GET"], defaults={"path": None}) +@APP.route(DOCS_ROUTE + "", methods=["GET"]) +def static_proxy (path=""): + """static route for an asset""" + if not path: + suffix = "" + else: + suffix = PurePosixPath(path).suffix + + if suffix not in [".css", ".js", ".map", ".png", ".svg", ".xml"]: + path = os.path.join(path, "index.html") + + return send_from_directory(DOCS_FILES, path) + + +@APP.route("/index.html") +@APP.route("/home/") +@APP.route("/") +def home_redirects (): + """redirect for home page""" + return redirect(url_for("static_proxy")) + + +if __name__ == "__main__": + APP.run(host="0.0.0.0", port=DOCS_PORT, debug=True) diff --git a/bin/push_pypi.sh b/bin/push_pypi.sh new file mode 100755 index 0000000000000000000000000000000000000000..2c1a9eff5ea341db10f44fdcd107f122e42aad69 --- /dev/null +++ b/bin/push_pypi.sh @@ -0,0 +1,10 @@ +#!/bin/bash -e -x + +rm -rf dist build textgraphs.egg-info +python3 -m build +twine check dist/* + +# this assumes the use of `~/.pypirc` +# https://packaging.python.org/en/latest/specifications/pypirc/ + +twine upload ./dist/* --verbose diff --git a/bin/vis_doc.py b/bin/vis_doc.py new file mode 100755 index 0000000000000000000000000000000000000000..1325e3d6a3ce02b6007e2e7a0deebd40c3814b8d --- /dev/null +++ b/bin/vis_doc.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Convert the markdown generated from Jupyter notebooks to preserve +rendered images, etc. 
+""" + +import os +import pathlib +import re +import sys +import time +import traceback +import typing + +from icecream import ic # pylint: disable=E0401 +from selenium import webdriver # pylint: disable=E0401 + + +class Converter: + """ +HTML/Markdown conversion + """ + PAT_HEADER = re.compile(r"^(```python\n\# for use.*production:\n.*\n```\n)", re.MULTILINE) + PAT_SOURCE = re.compile(r"\s+src\=\"(\S+)\"") + REPLACEMENT_HEADER: str = """ +!!! note + To run this notebook in JupyterLab, load [`examples/{}.ipynb`]({}/examples/{}.ipynb) + + """ + + def __init__ ( + self, + src_url: str, + ) -> None: + """ +Constructor. + """ + self.src_url: str = src_url + + + def replace_sys_header ( + self, + text: str, + stem: str, + *, + debug: bool = False, + ) -> str: + """ +Replace the initial cell in a tutorial notebook. + """ + output: typing.List[ str ] = [] + + for chunk in self.PAT_HEADER.split(text): + m_header: typing.Optional[ re.Match ] = self.PAT_HEADER.match(chunk) + + if debug: + ic(m_header) + + if m_header: + header: str = self.REPLACEMENT_HEADER.format(stem, self.src_url, stem) + output.append(header) + else: + output.append(chunk) + + return "\n".join(output) + + + def get_pyvis_html ( + self, + iframe: str, + *, + debug: bool = False, + ) -> str: + """ +Locate the HTML files generated by `PyVis` if any. 
+This assumes the HTML files are named `tmp.fig*.*` + """ + source_html: typing.Optional[ str ] = None + m_source: typing.Optional[ re.Match ] = self.PAT_SOURCE.search(iframe) + + if m_source: + source_html = m_source.group(1) + + if debug: + ic(source_html) + + if "tmp.fig" not in source_html: # type: ignore + # "): + in_iframe = False + + return "\n".join(output) + + +if __name__ == "__main__": + try: + conv: Converter = Converter( + "https://github.com/DerwenAI/textgraphs/blob/main", + ) + + filename: pathlib.Path = pathlib.Path(sys.argv[1]) + _parent: pathlib.Path = filename.parent + _stem: str = filename.stem + + ic(filename, _parent, _stem) + + with open(filename, "r", encoding = "utf-8") as fp: + html: str = fp.read() + + html = conv.replace_sys_header( # pylint: disable=C0103 + html, + _stem, + debug = False, # True + ) + + #print(text) + #sys.exit(0) + + html = conv.replace_pyvis_iframe( # pylint: disable=C0103 + html, + _parent, + _stem, + debug = True, # False + ) + + with open(filename, "w", encoding = "utf-8") as fp: + fp.write(html) + + except Exception as ex: # pylint: disable=W0718 + ic(ex) + traceback.print_exc() diff --git a/demo.py b/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..911f8a350af8a41179432827a8201f2017b53c58 --- /dev/null +++ b/demo.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Sample application to demo the `TextGraphs` library. 
+ +see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md +""" + +import asyncio +import sys # pylint: disable=W0611 +import traceback +import time +import typing + +from icecream import ic # pylint: disable=E0401 +from pyinstrument import Profiler # pylint: disable=E0401 +import matplotlib.pyplot as plt # pylint: disable=E0401 +import pandas as pd # pylint: disable=E0401 + +import textgraphs + + +if __name__ == "__main__": + SRC_TEXT: str = """ +Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog. +After the war, Werner fled to America to become famous. +""" + + ## set up + ## NB: profiler raises handler exceptions when `concur = False` + debug: bool = False # True + concur: bool = True # False + profile: bool = True # False + + if profile: + profiler: Profiler = Profiler() + profiler.start() + + try: + start_time: float = time.time() + + tg: textgraphs.TextGraphs = textgraphs.TextGraphs( + factory = textgraphs.PipelineFactory( + spacy_model = textgraphs.SPACY_MODEL, + ner = None, #textgraphs.NERSpanMarker(), + kg = textgraphs.KGWikiMedia( + spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API, + dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API, + dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API, + wikidata_api = textgraphs.WIKIDATA_API, + ), + infer_rels = [ + textgraphs.InferRel_OpenNRE( + model = textgraphs.OPENNRE_MODEL, + max_skip = textgraphs.MAX_SKIP, + min_prob = textgraphs.OPENNRE_MIN_PROB, + ), + textgraphs.InferRel_Rebel( + lang = "en_XX", + mrebel_model = textgraphs.MREBEL_MODEL, + ), + ], + ), + ) + + duration: float = round(time.time() - start_time, 3) + print(f"{duration:7.3f} sec: set up") + + + ## NLP parse + start_time = time.time() + + pipe: textgraphs.Pipeline = tg.create_pipeline( + SRC_TEXT.strip(), + ) + + duration = round(time.time() - start_time, 3) + print(f"{duration:7.3f} sec: parse text") + + + ## collect graph elements from the parse + 
start_time = time.time() + + tg.collect_graph_elements( + pipe, + debug = debug, + ) + + duration = round(time.time() - start_time, 3) + print(f"{duration:7.3f} sec: collect elements") + + + ## perform entity linking + start_time = time.time() + + tg.perform_entity_linking( + pipe, + debug = debug, + ) + + duration = round(time.time() - start_time, 3) + print(f"{duration:7.3f} sec: entity linking") + + + ## perform concurrent relation extraction + start_time = time.time() + + if concur: + try: + loop = asyncio.get_running_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + inferred_edges: list = loop.run_until_complete( + tg.infer_relations_async( + pipe, + debug = debug, + ) + ) + else: + inferred_edges = tg.infer_relations( + pipe, + debug = debug, + ) + + duration = round(time.time() - start_time, 3) + print(f"{duration:7.3f} sec: relation extraction") + + n_list: list = list(tg.nodes.values()) + + df_rel: pd.DataFrame = pd.DataFrame.from_dict([ + { + "src": n_list[edge.src_node].text, + "dst": n_list[edge.dst_node].text, + "rel": pipe.kg.normalize_prefix(edge.rel), + "weight": edge.prob, + } + for edge in inferred_edges + ]) + + ic(df_rel) + + + ## construct the _lemma graph_ + start_time = time.time() + + tg.construct_lemma_graph( + debug = debug, + ) + + duration = round(time.time() - start_time, 3) + print(f"{duration:7.3f} sec: construct graph") + + + ## rank the extracted phrases + start_time = time.time() + + tg.calc_phrase_ranks( + pr_alpha = textgraphs.PAGERANK_ALPHA, + debug = debug, + ) + + duration = round(time.time() - start_time, 3) + print(f"{duration:7.3f} sec: rank phrases") + + + ## show the extracted phrase results + ic(tg.get_phrases_as_df()) + + if debug: # pylint: disable=W0101 + for key, node in tg.nodes.items(): + print(key, node) + + for key, edge in tg.edges.items(): + print(key, edge) + + except Exception as ex: # pylint: disable=W0718 + ic(ex) + traceback.print_exc() + + + ## transform 
graph data to a _graph of relations_ + start_time = time.time() + + gor: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations( + tg, + ) + + gor.seeds( + debug = False, # True + ) + + gor.construct_gor( + debug = False, # True + ) + + _scores: typing.Dict[ tuple, float ] = gor.get_affinity_scores( + debug = False, # True + ) + + duration = round(time.time() - start_time, 3) + print(f"{duration:7.3f} sec: graph of relations") + + gor.render_gor_plt(_scores) + plt.show() + + #sys.exit(0) + + + ###################################################################### + ## stack profiler report + if profile: + profiler.stop() + profiler.print() + + ## output lemma graph as JSON + with open("lemma.json", "w", encoding = "utf-8") as fp: + fp.write(tg.dump_lemma_graph()) diff --git a/docs/abstract.md b/docs/abstract.md new file mode 100644 index 0000000000000000000000000000000000000000..89ba2ffce9c03a330c6871019a1b54a8f01b2072 --- /dev/null +++ b/docs/abstract.md @@ -0,0 +1,47 @@ +# Introduction + +**DRAFT** (WIP) + +The primary goal of this project is to improve semi-automated KG construction from large collections of unstructured text sources, while leveraging feedback from domain experts and maintaining quality checks for the aggregated results. + +Typical downstream use cases for these KGs include collecting data for industrial optimization use cases based on _operations research_, as mechanisms enabling structured LLM reasoning [#besta2024topo](biblio.md#besta2024topo), and potentially new methods of integrating KG linked data directly into LLM inference [#wen2023mindmap](biblio.md#wen2023mindmap) + +To this point, this project explores hybrid applications which leverage LLMs to improve _natural language processing_ (NLP) pipeline components, which are also complemented by other deep learning models, graph queries, semantic inference, and related APIs. + +Notably, LLMs come from NLP research. 
+Amidst an overwhelming avalanche of contemporary news headlines, pre-print papers, celebrity researchers, industry pundits, and so on ... +the hype begs a simple question: how good are LLMs at improving the results of natural language parsing and annotation in practice? + +Granted, it is possible to use LLM chat interfaces to generate entire KGs from unstructured text sources. +Results from this brute-force approach tend to be mixed, especially when KGs rely on non-trivial controlled vocabularies and overlapping concepts. +For examples, see [#lawrence2024ttg](biblio.md#lawrence2024ttg) and [#nizami2023llm](biblio.md#nizami2023llm). + +Issues with LLM accuracy (e.g., hallucinations) may be partially addressed through use of _retrieval augmented generation_ (RAG). +Even so, this approach tends to be expensive, especially when a large number of PDF documents need to be used as input. +Use of a fully-automated "black box" based on an LLM chat agent in production use cases also tends to contradict the benefits of curating a KG to collect representations of an organization's domain expertise. + +There are perhaps some deeper issues implied in this work. +To leverage "generative AI" for KGs, we must cross multiple boundaries of representation. +For example, graph ML approaches which start from graph-theoretic descriptions are losing vital information. +On the one hand, these are generally focused on _node prediction_ or _edge prediction_ tasks, which seems overly reductionist and simplistic in the context of trying to generate streams of _composable elements_ for building graphs. +On the other hand, these approaches typically get trained on _node embeddings_, _edge embeddings_, or _graph embeddings_ -- which may not quite fit the problem at hand. +Rolling back even further, the transition from NLP parsing of unstructured text sources to the construction of KGs also tends to throw away a lot of potentially useful annotations and context available from the NLP workflows.
+Commonly accepted means for training LLMs from text sources directly often use tokenization which is relatively naïve about what might be structured within the data, other than linear sequences of characters. +Notably, this ignores the relationships among surface forms of text and their co-occurrence with predicted entities or relations. +Some contemporary approaches to RAG use "chunked" text, attempting to link between chunks, even though this approach arguably destroys information about what is structured within that input data. +There are multiple disconnects between the source data, the representation methods used in training models, and the tactics employed for applications; however, quite arguably the "applications" targeted in research projects generally stop at comparisons of benchmarks. +Overall, these disconnects indicate the need for rethinking the problem at multiple points. + +For industry uses of KGs, one frequent observation from those leading production projects is that the "last mile" of applications generally relies on _operations research_, not ML. +We must keep these needs in mind when applying "generative AI" approaches to industry use cases. +Are we developing representations which can subsequently be leveraged for dynamic programming, convex optimization, etc.? + +This project explores a different definition for "generative AI" in the context of working with KGs for production use cases. +Rather than pursue an LLM to perform all required tasks, is it possible to combine the use of smaller, more specialized models for specific tasks within the reasonably well-understood process of KG construction? +In broad strokes, can this work alternative provide counterfactuals to the contemporary trends for chat-based _prompt engineering_? + +Seeking to integrate results from several other research projects implies substantial amounts of code reuse.
+It would be intractable in terms of time and funding to rewrite code and then re-evaluate models for the many research projects which are within the scope of this work. +Therefore reproducibility of published results -- based on open source code, models, evals, etc. -- becomes a crucial factor for determining whether other projects are suitable to be adapted into KG workflows. + +For the sake of brevity, we do not define all of the terminology used, instead relying on broadly used terms in the literature. diff --git a/docs/ack.md b/docs/ack.md new file mode 100644 index 0000000000000000000000000000000000000000..e237e02db618697106c3ac79ff72f4880f59a0ed --- /dev/null +++ b/docs/ack.md @@ -0,0 +1,11 @@ +# Acknowledgements + +Community by Aneeque Ahmed from the Noun Project + +Contributors: + + - Jürgen Müller, Zahid Abul-Basher, Nihatha Lathiff, et al., @ BASF + - open source sponsors for Derwen.ai + - perspectives from the KùzuDB.com team + - perspectives from the Argilla.io team + - feedback and suggestions from participants at [Dagstuhl Seminar 24061](https://www.dagstuhl.de/24061) diff --git a/docs/assets/favicon.png b/docs/assets/favicon.png new file mode 100644 index 0000000000000000000000000000000000000000..fa810528e0968da9767e1b65b4bb30379a353c93 Binary files /dev/null and b/docs/assets/favicon.png differ diff --git a/docs/assets/hitl.png b/docs/assets/hitl.png new file mode 100644 index 0000000000000000000000000000000000000000..3ffc389c6b0a9339c80739c5b055947c8678e3f7 Binary files /dev/null and b/docs/assets/hitl.png differ diff --git a/docs/assets/logo.png b/docs/assets/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..8a80c4bccd4bb323092e3d840df0b7c3d22284c0 Binary files /dev/null and b/docs/assets/logo.png differ diff --git a/docs/assets/nouns/api.png b/docs/assets/nouns/api.png new file mode 100644 index 0000000000000000000000000000000000000000..0fdda8015f568fa78d8ee622567e4dc2a71e9b07 Binary files /dev/null and
b/docs/assets/nouns/api.png differ diff --git a/docs/assets/nouns/biblio.png b/docs/assets/nouns/biblio.png new file mode 100644 index 0000000000000000000000000000000000000000..12cb95bc9fc11792234b3c32bf799639b916735c Binary files /dev/null and b/docs/assets/nouns/biblio.png differ diff --git a/docs/assets/nouns/community.png b/docs/assets/nouns/community.png new file mode 100644 index 0000000000000000000000000000000000000000..3df2db4c4f6d3cdd4df079fd23b40e3f40442509 Binary files /dev/null and b/docs/assets/nouns/community.png differ diff --git a/docs/assets/nouns/concepts.png b/docs/assets/nouns/concepts.png new file mode 100644 index 0000000000000000000000000000000000000000..194b88e34ae142238450ab1a6147784506660f4f Binary files /dev/null and b/docs/assets/nouns/concepts.png differ diff --git a/docs/assets/nouns/discovery.png b/docs/assets/nouns/discovery.png new file mode 100644 index 0000000000000000000000000000000000000000..4ea0768d3446f4e7a46d3a7eac67d27c1a7a4db8 Binary files /dev/null and b/docs/assets/nouns/discovery.png differ diff --git a/docs/assets/nouns/evidence.png b/docs/assets/nouns/evidence.png new file mode 100644 index 0000000000000000000000000000000000000000..f638b1dcc065a72fbc595bf84eb8ac27758f75b1 Binary files /dev/null and b/docs/assets/nouns/evidence.png differ diff --git a/docs/assets/nouns/feedback.png b/docs/assets/nouns/feedback.png new file mode 100644 index 0000000000000000000000000000000000000000..fa3abbc124100216783f6c379bbcdc43dfb03b50 Binary files /dev/null and b/docs/assets/nouns/feedback.png differ diff --git a/docs/assets/nouns/howto.png b/docs/assets/nouns/howto.png new file mode 100644 index 0000000000000000000000000000000000000000..bbf717983bb11d50faeed1fe90b7ce5127ba096b Binary files /dev/null and b/docs/assets/nouns/howto.png differ diff --git a/docs/assets/nouns/tutorial.png b/docs/assets/nouns/tutorial.png new file mode 100644 index 0000000000000000000000000000000000000000..63c428de5255d09a892e3ba095d16a75ab748370 Binary 
files /dev/null and b/docs/assets/nouns/tutorial.png differ diff --git a/docs/assets/textgraphs.graffle b/docs/assets/textgraphs.graffle new file mode 100644 index 0000000000000000000000000000000000000000..40ea164acd5f9d19f8ac51acd35ee39cc0f67e1f --- /dev/null +++ b/docs/assets/textgraphs.graffle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2177f30434db8dc6534ed39b3f5a9bed3b0fbd00db26afd841f6e77c788910f2 +size 1410392 diff --git a/docs/biblio.md b/docs/biblio.md new file mode 100644 index 0000000000000000000000000000000000000000..a0cd367e40b0968a814742b332bdb428038f666b --- /dev/null +++ b/docs/biblio.md @@ -0,0 +1,232 @@ +# Bibliography + +books by b a r z i n from the Noun Project + +Where possible, the bibliography entries use conventions at + +for [*citation keys*](https://bibdesk.sourceforge.io/manual/BibDeskHelp_2.html). +Journal abbreviations come from + +based on [*ISO 4*](https://en.wikipedia.org/wiki/ISO_4) standards. +Links to online versions of cited works use +[DOI](https://www.doi.org/) +for [*persistent identifiers*](https://www.crossref.org/education/metadata/persistent-identifiers/). +When available, +[*open access*](https://peerj.com/preprints/3119v1/) +URLs are listed. + + +## – A – + +### aarsen2023ner + +["SpanMarker for Named Entity Recognition"](https://raw.githubusercontent.com/tomaarsen/SpanMarkerNER/main/thesis.pdf) +**Tom Aarsen** +*Radboud University* (2023-06-01) +> A span-level Named Entity Recognition (NER) model that aims to improve performance while reducing computational requirements. SpanMarker leverages special marker tokens and utilizes BERT-style encoders with position IDs and attention mask matrices to capture contextual information effectively. 
+ +### auer07dbpedia + +["DBpedia: A Nucleus for a Web of Open Data"](https://doi.org/10.1007/978-3-540-76298-0_52) +**Sören Auer**, **Christian Bizer**, **Georgi Kobilarov**, **Jens Lehmann**, **Richard Cyganiak**, **Zachary Ives** +*ISWC* (2007-11-11) +> DBpedia is a community effort to extract structured information from Wikipedia and to make this information available on the Web. DBpedia allows you to ask sophisticated queries against datasets derived from Wikipedia and to link other datasets on the Web to Wikipedia data. + +## – B – + +### bachbhg17 + +["Hinge-Loss Markov Random Fields and Probabilistic Soft Logic"](https://arxiv.org/abs/1505.04406) +**Stephen Bach**, **Matthias Broecheler**, **Bert Huang**, **Lise Getoor** +*JMLR* (2017–11–17) +> We introduce two new formalisms for modeling structured data, and show that they can both capture rich structure and scale to big data. The first, hinge-loss Markov random fields (HL-MRFs), is a new kind of probabilistic graphical model that generalizes different approaches to convex inference. + +### barrière2016elsf + +["Entities, Labels, and Surface Forms"](https://doi.org/10.1007/978-3-319-41337-2_2) +**Caroline Barrière** +_Springer_ (2016-11-19) +> We will look into a first obstacle toward this seemingly simple IE goal: the fact that entities do not have normalized names. Instead, entities can be referred to by many different surface forms. + +### besta2024topo + +["Topologies of Reasoning: Demystifying Chains, Trees, and Graphs of Thoughts"](https://arxiv.org/abs/2401.14295) +**Maciej Besta**, **Florim Memedi**, **Zhenyu Zhang**, **Robert Gerstenberger**, **Nils Blach**, **Piotr Nyczyk**, **Marcin Copik**, **Grzegorz Kwasniewski**, **Jurgen Müller**, **Lukas Gianinazzi**, **Ales Kubicek**, **Hubert Niewiadomski**, **Onur Mutlu**, **Torsten Hoefler** +_ETH Zurich_ (2024-01-25) +> Introducing a blueprint and an accompanying taxonomy of prompting schemes, focusing on the underlying structure of reasoning. 
+ +## – C – + +### cabot2023redfm + +["REDFM: a Filtered and Multilingual Relation Extraction Dataset"](https://arxiv.org/abs/2306.09802) +**Pere-Lluís Huguet Cabot**, **Simone Tedeschi**, **Axel-Cyrille Ngonga Ngomo**, **Roberto Navigli** +_ACL_ (2023-06-19) +> Relation Extraction (RE) is a task that identifies relationships between entities in a text, enabling the acquisition of relational facts and bridging the gap between natural language and structured knowledge. However, current RE models often rely on small datasets with low coverage of relation types, particularly when working with languages other than English. In this paper, we address the above issue and provide two new resources that enable the training and evaluation of multilingual RE systems. + +## – E – + +### erxlebengkmv14 + +["Introducing Wikidata to the Linked Data Web"](https://doi.org/10.1007/978-3-319-11964-9_4) +**Fredo Erxleben**, **Michael Günther**, **Markus Krötzsch**, **Julian Mendez**, **Denny Vrandečić** +_ISWC_ (2014-10-19) +> We introduce new RDF exports that connect Wikidata to the Linked Data Web. We explain the data model of Wikidata and discuss its encoding in RDF. Moreover, we introduce several partial exports that provide more selective or simplified views on the data. + +## – F – + +### feng2023kuzu + +["KÙZU Graph Database Management System"](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf) +**Xiyang Feng**, **Guodong Jin**, **Ziyi Chen**, **Chang Liu**, **Semih Salihoğlu** +_CIDR_ (2023-01-08) +> We present Kùzu, a new GDBMS we are developing at University of Waterloo that aims to integrate state-of-art storage, indexing, and query processing techniques to highly optimize for this feature set. 
+ +## – G – + +### galkin2023ultra + +["Towards Foundation Models for Knowledge Graph Reasoning"](https://arxiv.org/abs/2310.04562) +**Mikhail Galkin**, **Xinyu Yuan**, **Hesham Mostafa**, **Jian Tang**, **Zhaocheng Zhu** +preprint (2023–10–06) +> ULTRA builds relational representations as a function conditioned on their interactions. Such a conditioning strategy allows a pre-trained ULTRA model to inductively generalize to any unseen KG with any relation vocabulary and to be fine-tuned on any graph. + +## – H – + +### hagberg2008 + +["Exploring network structure, dynamics, and function using NetworkX"](https://conference.scipy.org/proceedings/SciPy2008/paper_2/) +**Aric A. Hagberg**, **Daniel A. Schult**, **Pieter J. Swart** +_SciPy2008_ (2008-08-19) +> NetworkX is a Python language package for exploration and analysis of networks and network algorithms. The core package provides data structures for representing many types of networks, or graphs, including simple graphs, directed graphs, and graphs with parallel edges and self loops. + +### hahnr88 + +["Automatic generation of hypertext knowledge bases"](https://doi.org/10.1145/966861.45429) +**Udo Hahn**, **Ulrich Reimer** +_ACM SIGOIS_ 9:2 (1988-04-01) +> The condensation process transforms the text representation structures resulting from the text parse into a more abstract thematic description of what the text is about, filtering out irrelevant knowledge structures and preserving only the most salient concepts. + +### hamilton2020grl + +[_Graph Representation Learning_](https://www.cs.mcgill.ca/~wlh/grl_book/) +**William Hamilton** +Morgan and Claypool (pre-print 2020) +> A brief but comprehensive introduction to graph representation learning, including methods for embedding graph data, graph neural networks, and deep generative models of graphs. 
+ +### hangyyls19 + +["OpenNRE: An Open and Extensible Toolkit for Neural Relation Extraction"](https://doi.org/10.18653/v1/D19-3029) +**Xu Han**, **Tianyu Gao**, **Yuan Yao**, **Deming Ye**, **Zhiyuan Liu**, **Maosong Sun** +*EMNLP* (2019-11-03) +> OpenNRE is an open-source and extensible toolkit that provides a unified framework to implement neural models for relation extraction (RE). + +### hartig14 + +["Reconciliation of RDF* and Property Graphs"](https://arxiv.org/abs/1409.3288) +**Olaf Hartig** +_CoRR_ (2014-11-14) +> The document proposes a formalization of the PG model and introduces well-defined transformations between PGs and RDF. + +### honnibal2020spacy + +["spaCy: Industrial-strength Natural Language Processing in Python"](https://doi.org/10.5281/zenodo.1212303) +**Matthew Honnibal**, **Ines Montani**, **Sofie Van Landeghem**, **Adriane Boyd** +*Explosion AI* (2016-10-18) +> spaCy is a library for advanced Natural Language Processing in Python and Cython. It's built on the very latest research, and was designed from day one to be used in real products. + +## – L – + +### lee2023ingram + +["InGram: Inductive Knowledge Graph Embedding via Relation Graphs"](https://arxiv.org/abs/2305.19987) +**Jaejun Lee**, **Chanyoung Chung**, **Joyce Jiyoung Whang** +_ICML_ (2023–08–17) +> In this paper, we propose an INductive knowledge GRAph eMbedding method, InGram, that can generate embeddings of new relations as well as new entities at inference time. + +### loganlpgs19 + +["Barack's Wife Hillary: Using Knowledge-Graphs for Fact-Aware Language Modeling"](https://arxiv.org/abs/1906.07241) +**Robert L. Logan IV**, **Nelson F. Liu**, **Matthew E. Peters**, **Matt Gardner**, **Sameer Singh** +_ACL_ (2019-06-20) +> We introduce the knowledge graph language model (KGLM), a neural language model with mechanisms for selecting and copying facts from a knowledge graph that are relevant to the context. 
+ +## – M – + +### martonsv17 + +["Formalising openCypher Graph Queries in Relational Algebra"](https://doi.org/10.1007/978-3-319-66917-5_13) +**József Marton**, **Gábor Szárnyas**, **Dániel Varró** +_ADBIS_ (2017-08-25) +> We present a formal specification for openCypher, a high-level declarative graph query language with an ongoing standardisation effort. + +### mihalcea04textrank + +["TextRank: Bringing Order into Text"](https://www.aclweb.org/anthology/W04-3252/) +**Rada Mihalcea**, **Paul Tarau** +*EMNLP* pp. 404-411 (2004-07-25) +> In this paper, the authors introduce TextRank, a graph-based ranking model for text processing, and show how this model can be successfully used in natural language applications. + +## – N – + +### nathan2016ptr + +["PyTextRank, a Python implementation of TextRank for phrase extraction and summarization of text documents"](https://doi.org/10.5281/zenodo.4637885) +**Paco Nathan**, et al. +*Derwen* (2016-10-03) +> Python implementation of TextRank algorithms ("textgraphs") for phrase extraction + +### nathan2023glod + +["Graph Levels of Detail"](https://blog.derwen.ai/graph-levels-of-detail-ea4226abba55) +**Paco Nathan** +*Derwen* (2023-11-12) +> How can we work with graph data in more abstracted, aggregate perspectives? While we can run queries on graph data to compute aggregate measures, we don’t have programmatic means of “zooming out” to consider a large graph the way that one zooms out when using an online map. + +## - Q - + +### qin2023sgr + +["Semantic Random Walk for Graph Representation Learning in Attributed Graphs"](https://arxiv.org/abs/2305.06531) +**Meng Qin** +*Hong Kong University of Science and Technology* (2023-05-11) +> We introduced a novel SGR method to generally formulate the network embedding in attributed graphs as a high-order proximity based embedding task of an auxilairy weighted graph with heterogeneous entities. 
+ +### qin2024irwe + +["IRWE: Inductive Random Walk for Joint Inference of Identity and Position Network Embedding"](https://arxiv.org/abs/2401.00651) +**Meng Qin**, **Dit-Yan Yeung** +*Hong Kong University of Science and Technology* (2024-01-01) +> Since nodes in a community should be densely connected, nodes within the same community are more likely to be reached via RWs compared with those in different communities. Therefore, nodes with similar positions (e.g., in the same community) are highly believed to have similar RW statistics. + +## - R - + +### ramage2009rwt + +["Random walks for text semantic similarity"](https://dl.acm.org/doi/10.5555/1708124.1708131) +**Daniel Ramage**, **Anna Rafferty**, **Christopher Manning** +_ACL-IJCNLP_ (2009-09-07) +> Our algorithm aggregates local relatedness information via a random walk over a graph constructed from an underlying lexical resource. The stationary distribution of the graph walk forms a “semantic signature” that can be compared to another such distribution to get a relatedness score for texts. + +## – W – + +### warmerdam2023pydata + +["Natural Intelligence is All You Need™"](https://youtu.be/C9p7suS-NGk?si=7Ohq3BV654ia2Im4) +**Vincent Warmerdam** +*PyData Amsterdam* (2023-09-15) +> In this talk I will try to show you what might happen if you allow yourself the creative freedom to rethink and reinvent common practices once in a while. As it turns out, in order to do that, natural intelligence is all you need. And we may start needing a lot of it in the near future. + +### wen2023mindmap + +["MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large Language Models"](https://arxiv.org/abs/2308.09729) +**Yilin Wen**, **Zifeng Wang**, **Jimeng Sun** +_arXiv_ (2023-08-17) +> We build a prompting pipeline that endows LLMs with the capability of comprehending KG inputs and inferring with a combined implicit knowledge and the retrieved external knowledge. 
+ +### wolf2020transformers + +["Transformers: State-of-the-Art Natural Language Processing"](https://doi.org/10.18653/v1/2020.emnlp-demos.6) +**Thomas Wolf**, **Lysandre Debut**, **Victor Sanh**, **Julien Chaumond**, **Clement Delangue**, **Anthony Moi**, **Pierric Cistac**, **Tim Rault**, **Remi Louf**, **Morgan Funtowicz**, **Joe Davison**, **Sam Shleifer**, **Patrick von Platen**, **Clara Ma**, **Yacine Jernite**, **Julien Plu**, **Canwen Xu**, **Teven Le Scao**, **Sylvain Gugger**, **Mariama Drame**, **Quentin Lhoest**, **Alexander Rush** +*EMNLP* (2020-11-16) +> The library consists of carefully engineered state-of-the art Transformer architectures under a unified API. Backing this library is a curated collection of pretrained models made by and available for the community. diff --git a/docs/build.md b/docs/build.md new file mode 100644 index 0000000000000000000000000000000000000000..b42f3f0db1e5675b116224b1586411f9f61bd938 --- /dev/null +++ b/docs/build.md @@ -0,0 +1,132 @@ +# Build Instructions + +API by Adnen Kadri from the Noun Project + +!!! note + In most cases you won't need to build this package locally. + +Unless you're doing development work on the **textgraphs** library itself, +simply install based on the instructions in +["Getting Started"](https://derwen.ai/docs/txg/start/). + + +## Setup + +To set up the build environment locally: +``` +python3 -m venv venv +source venv/bin/activate +python3 -m pip install -U pip wheel setuptools + +python3 -m pip install -e . +python3 -m pip install -r requirements-dev.txt +``` + +We use *pre-commit hooks* based on [`pre-commit`](https://pre-commit.com/) +and to configure that locally: +``` +pre-commit install --hook-type pre-commit +``` + + +## Test Coverage + +This project uses +[`pytest`](https://docs.pytest.org/) +for *unit test* coverage. +Source for unit tests is in the +[`tests`](https://github.com/DerwenAI/textgraphs/tree/main/tests) +subdirectory. 
+ +To run the unit tests: +``` +python3 -m pytest +``` + +Note that these tests run as part of the CI workflow +whenever code is updated on the GitHub repo. + + +## Online Documentation + +To generate documentation pages, you will also need to download +[`ChromeDriver`](https://googlechromelabs.github.io/chrome-for-testing/) +for your version of the `Chrome` browser, saved as `chromedriver` in +this directory. + +Source for the documentation is in the +[`docs`](https://github.com/DerwenAI/textgraphs/tree/main/docs) +subdirectory. + +To build the documentation: +``` +./bin/nb_md.sh +./pkg_doc.py docs/ref.md +mkdocs build +``` + +Then run `./bin/preview.py` and load +in your browser to preview the generated microsite locally. + +To package the generated microsite for deployment on a +web server: +``` +tar cvzf txg.tgz site/ +``` + + +## Remote Repo Updates + +To update source code repo on GitHub: + +``` +git remote set-url origin https://github.com/DerwenAI/textgraphs.git +git push +``` + +Create new releases on GitHub then run `git pull` locally prior to +updating Hugging Face or making a new package release. + +To update source code repo+demo on Hugging Face: + +``` +git remote set-url origin https://huggingface.co/spaces/DerwenAI/textgraphs +git push +``` + + +## Package Release + +To update the [release on PyPi](https://pypi.org/project/textgraphs/): +``` +./bin/push_pypi.sh +``` + + +## Packaging + +Both the spaCy and PyPi teams induce packaging errors since they +have "opinionated" views which conflict against each other and also +don't quite follow the [Python packaging standards](https://peps.python.org/pep-0621/). + +Moreover, the various dependencies here use a wide range of approaches +for model downloads: quite appropriately, the spaCy team does not want +to package their language models on PyPi. +However, they don't use more contemporary means of model download, +such as HF transformers, either -- and that triggers logging problems. 
+Overall, logging approaches used by the dependencies here for errors/warnings +are mostly ad-hoc. + +These three issues (packaging, model downloads, logging) pose a small nightmare +for managing Python library packaging downstream. +To that point, this project implements several workarounds so that +applications can download from PyPi. + +Meanwhile keep watch on developments of the following dependencies, +if they introduce breaking changes or move toward more standard +packaging practices: + + * `spaCy` -- model downloads, logging + * `OpenNRE` -- PyPi packaging, logging + * HF `transformers` and `tokenizers` -- logging + * WikiMedia APIs -- SSL certificate expiry diff --git a/docs/conclude.md b/docs/conclude.md new file mode 100644 index 0000000000000000000000000000000000000000..235c2081018c11b7ef783bcc613fb4cade6158ab --- /dev/null +++ b/docs/conclude.md @@ -0,0 +1,53 @@ +# Conclusions + +**DRAFT** (WIP) + +`TextGraphs` library provides a highly configurable and extensible open source Python library for the integration and evaluation of several LLM components. This has been built with attention to allowing for concurrency and parallelism for high-performance computing on distributed systems. + +TODO: + + - leverage co-reference + - leverage closure constrained by domain/range + - general => specific, uncertain => confident + +The state of _relation extraction_ is arguably immature. +While the papers in this area compare against benchmarks, their training datasets mostly have been built from Wikidata sources, and inferred relations result in _labels_ not IRIs. +This precludes downstream use of the inferred relations for semantic inference. +Ultimately, how can better training data be developed -- e.g., for relation extraction -- to improve large models used in constructing/augmenting knowledge graphs? + +## Questions for Follow Up Research + +Many existing projects produce results which are **descriptive, but not computable**. 
+However, given recent innovations, such as _DPO_, there appear to be many opportunities for reworking the training datasets used in +NRE and RE models, following the pattern of `Notus` + +**R1**: we have demonstrated how to leverage LLM components while emphasizing HITL (domain experts) and quality of results + + +**R2**: we have suggested areas where investments in data quality +may provide substantial gains + +One key take-away from this project is that the model deployments are relatively haphazard across a wide spectrum of performance: some of the open source dependencies use efficient frameworks such as Hugging Face `transformers` to load models, while others use ad-hoc approaches which are much less performant. + +Granted, use of LLMs and other deep learning models is expected to increase computational requirements substantially. +Given the integration of APIs, the compute, memory, and network requirements for running the `TextGraphs` library in production can be quite large. +Software engineering optimizations can reduce these requirements substantially through use of hardware acceleration, localized services, proxy/caching, and concurrency. + +However, a more effective approach would be to make investments in data quality (training datasets, benchmarks, evals, etc.) for gains within the core technologies used here: NER, RE, etc. +Data-first iterations on the model dependencies can alleviate much of this problem. + + +**R3**: we have proposed a rubric for evaluating/rating ML open source +w.r.t. production use cases + +This project integrates available open source projects across a wide range of NLP topics. +Perspectives were gained from evaluating many open source LLM projects related to NLP components, and the state of readiness for their use in production libraries overall. + +Note that reproducibility rates are abysmally low for open source which accompanies machine learning research papers. 
+Few projects install correctly, and fewer still run without exceptions. +Even the better available OSS projects for a given research topic (e.g., _graph embeddings_, _relation extraction_) tend not to have been maintained for years. Of the projects which run, few reproduce their published results, and most are oriented toward command-line (CLI) use to prove specific benchmark claims. +These tend to be difficult to rework into production-quality libraries, due to concerns about performance, security, licensing, etc. + +As an outcome of this inquiry, this project presents a rubric for evaluating research papers and their associated code, based on reproducibility and eventual usefulness in software implementations. + +The views expressed are those of the authors and do not reflect the official policy or position of the funding organizations. diff --git a/docs/details.md b/docs/details.md new file mode 100644 index 0000000000000000000000000000000000000000..1e31f07e8247ca532c9a3abb118e89f989f15e8d --- /dev/null +++ b/docs/details.md @@ -0,0 +1,64 @@ +This project implements an LLM-augmented `textgraph` algorithm for +constructing a _lemma graph_ from raw, unstructured text sources. + +The `TextGraphs` library is based on work developed by +[Derwen](https://derwen.ai/graph) +in 2023 Q2 for customer apps and used in our `Cysoni` +product. 
+ +This library integrates code from: + + * [`SpanMarker`](https://github.com/tomaarsen/SpanMarkerNER/) + * [`spaCy-DBpedia-Spotlight`](https://github.com/MartinoMensio/spacy-dbpedia-spotlight) + * [`REBEL`](https://github.com/Babelscape/rebel) + * [`OpenNRE`](https://github.com/thunlp/OpenNRE/) + * [`qwikidata`](https://github.com/kensho-technologies/qwikidata) + * [`pulp`](https://github.com/coin-or/pulp) + * [`spaCy`](https://spacy.io/) + * [`HF transformers`](https://huggingface.co/docs/transformers/index) + * [`PyTextRank`](https://github.com/DerwenAI/pytextrank/) + + +For more background about early efforts which led to this line of inquiry, see the recent talks: + + * ["Language, Graphs, and AI in Industry"](https://derwen.ai/s/mqqm) + **Paco Nathan**, K1st World (2023-10-11) ([video](https://derwen.ai/s/4h2kswhrm3gc)) + * ["Language Tools for Creators"](https://derwen.ai/s/rhvg) + **Paco Nathan**, FOSSY (2023-07-13) + + +The `TextGraphs` library shows integrations of several of these kinds +of components, complemented with use of graph queries, graph algorithms, +and other related tooling. +Admittedly, the results present a "hybrid" approach: +it's not purely "generative" -- whatever that might mean. + +A core principle here is to provide results from the natural language +workflows which may be used for expert feedback. +In other words, how can we support means for leveraging +a _human-in-the-loop_ (HITL) process? + +Another principle has been to create a Python library built to produce +configurable, extensible pipelines. +Care has been given to writing code that can be run concurrently +(e.g., leveraging `asyncio`), using dependencies which have +business-friendly licenses, and paying attention to security concerns. + +The library provides three main affordances for AI applications: + + 1. With the default settings, one can use `TextGraphs` to extract ranked key phrases from raw text -- even without using any of the additional deep learning models. 
+ + 2. Going a few steps further, one can generate an RDF or LPG graph from raw texts, and make use of _entity linking_, _relation extraction_, and other techniques to ground the natural language parsing by leveraging some knowledge graph which represents a particular domain. Default examples use WikiMedia graphs: DBpedia, Wikidata, etc. + + 3. A third set of goals for `TextGraphs` is to provide a "playground" or "gym" for evaluating _graph levels of detail_, i.e., abstraction layers for knowledge graphs, and to explore some of the emerging work to produce _foundation models_ for knowledge graphs through topological transforms. + +Regarding the third point, consider how language parsing produces +graphs by definition, although NLP results tend to be quite _noisy_. +The annotations inferred by NLP pipelines often get thrown out. +This seemed like a good opportunity to generate sample data for +"condensing" graphs into more abstracted representations. +In other words, patterns within the relatively noisy parse results +can be condensed into relatively refined knowledge graph elements. + +Note that while the `spaCy` library for NLP plays a central role, the +`TextGraphs` library is not intended to become a `spaCy` pipeline. diff --git a/docs/ex0_0.md b/docs/ex0_0.md new file mode 100644 index 0000000000000000000000000000000000000000..a2c4950e8696d27a6da8d67533884d519c80dd01 --- /dev/null +++ b/docs/ex0_0.md @@ -0,0 +1,689 @@ + + +!!! note + To run this notebook in JupyterLab, load [`examples/ex0_0.ipynb`](https://github.com/DerwenAI/textgraphs/blob/main/examples/ex0_0.ipynb) + + + +# demo: TextGraphs + LLMs to construct a 'lemma graph' + +The _TextGraphs_ library is intended for iterating through a sequence of paragraphs. 
+ +## environment + + +```python +from IPython.display import display, HTML, Image, SVG +import pathlib +import typing + +from icecream import ic +from pyinstrument import Profiler +import matplotlib.pyplot as plt +import pandas as pd +import pyvis +import spacy + +import textgraphs +``` + + +```python +%load_ext watermark +``` + + +```python +%watermark +``` + + Last updated: 2024-01-16T17:41:51.229985-08:00 + + Python implementation: CPython + Python version : 3.10.11 + IPython version : 8.20.0 + + Compiler : Clang 13.0.0 (clang-1300.0.29.30) + OS : Darwin + Release : 21.6.0 + Machine : x86_64 + Processor : i386 + CPU cores : 8 + Architecture: 64bit + + + + +```python +%watermark --iversions +``` + + sys : 3.10.11 (v3.10.11:7d4cc5aa85, Apr 4 2023, 19:05:19) [Clang 13.0.0 (clang-1300.0.29.30)] + spacy : 3.7.2 + pandas : 2.1.4 + matplotlib: 3.8.2 + textgraphs: 0.5.0 + pyvis : 0.3.2 + + + +## parse a document + +provide the source text + + +```python +SRC_TEXT: str = """ +Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog. +After the war, Werner fled to America to become famous. 
+""" +``` + +set up the statistical stack profiling + + +```python +profiler: Profiler = Profiler() +profiler.start() +``` + +set up the `TextGraphs` pipeline + + +```python +tg: textgraphs.TextGraphs = textgraphs.TextGraphs( + factory = textgraphs.PipelineFactory( + spacy_model = textgraphs.SPACY_MODEL, + ner = None, + kg = textgraphs.KGWikiMedia( + spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API, + dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API, + dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API, + wikidata_api = textgraphs.WIKIDATA_API, + min_alias = textgraphs.DBPEDIA_MIN_ALIAS, + min_similarity = textgraphs.DBPEDIA_MIN_SIM, + ), + infer_rels = [ + textgraphs.InferRel_OpenNRE( + model = textgraphs.OPENNRE_MODEL, + max_skip = textgraphs.MAX_SKIP, + min_prob = textgraphs.OPENNRE_MIN_PROB, + ), + textgraphs.InferRel_Rebel( + lang = "en_XX", + mrebel_model = textgraphs.MREBEL_MODEL, + ), + ], + ), +) + +pipe: textgraphs.Pipeline = tg.create_pipeline( + SRC_TEXT.strip(), +) +``` + +## visualize the parse results + + +```python +spacy.displacy.render( + pipe.ner_doc, + style = "ent", + jupyter = True, +) +``` + + +
+ + Werner Herzog + PERSON + + is a remarkable filmmaker and an intellectual originally from + + Germany + GPE + +, the son of + + Dietrich Herzog + PERSON + +.
After the war, + + Werner + PERSON + + fled to + + America + GPE + + to become famous.
+ + + +```python +parse_svg: str = spacy.displacy.render( + pipe.ner_doc, + style = "dep", + jupyter = False, +) + +display(SVG(parse_svg)) +``` + + + +![svg](ex0_0_files/ex0_0_17_0.svg) + + + +## collect graph elements from the parse + + +```python +tg.collect_graph_elements( + pipe, + debug = False, +) +``` + + +```python +ic(len(tg.nodes.values())); +ic(len(tg.edges.values())); +``` + + ic| len(tg.nodes.values()): 36 + ic| len(tg.edges.values()): 42 + + +## perform entity linking + + +```python +tg.perform_entity_linking( + pipe, + debug = False, +) +``` + +## infer relations + + +```python +inferred_edges: list = await tg.infer_relations_async( + pipe, + debug = False, +) + +inferred_edges +``` + + + + + [Edge(src_node=0, dst_node=10, kind=, rel='https://schema.org/nationality', prob=1.0, count=1), + Edge(src_node=15, dst_node=0, kind=, rel='https://schema.org/children', prob=1.0, count=1), + Edge(src_node=27, dst_node=22, kind=, rel='https://schema.org/event', prob=1.0, count=1)] + + + +## construct a lemma graph + + +```python +tg.construct_lemma_graph( + debug = False, +) +``` + +## extract ranked entities + + +```python +tg.calc_phrase_ranks( + pr_alpha = textgraphs.PAGERANK_ALPHA, + debug = False, +) +``` + +show the resulting entities extracted from the document + + +```python +df: pd.DataFrame = tg.get_phrases_as_df() +df +``` + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
node_idtextposlabelcountweight
00Werner HerzogPROPNdbr:Werner_Herzog10.080547
110GermanyPROPNdbr:Germany10.080437
215Dietrich HerzogPROPNdbo:Person10.079048
327AmericaPROPNdbr:United_States10.079048
424WernerPROPNdbo:Person10.077633
54filmmakerNOUNowl:Thing10.076309
622warNOUNowl:Thing10.076309
732a remarkable filmmakernoun_chunkNone10.076077
87intellectualNOUNowl:Thing10.074725
913sonNOUNowl:Thing10.074725
1033an intellectualnoun_chunkNone10.074606
1134the sonnoun_chunkNone10.074606
1235the warnoun_chunkNone10.074606
+
+ + + +## visualize the lemma graph + + +```python +render: textgraphs.RenderPyVis = tg.create_render() + +pv_graph: pyvis.network.Network = render.render_lemma_graph( + debug = False, +) +``` + +initialize the layout parameters + + +```python +pv_graph.force_atlas_2based( + gravity = -38, + central_gravity = 0.01, + spring_length = 231, + spring_strength = 0.7, + damping = 0.8, + overlap = 0, +) + +pv_graph.show_buttons(filter_ = [ "physics" ]) +pv_graph.toggle_physics(True) +``` + + +```python +pv_graph.prep_notebook() +pv_graph.show("tmp.fig01.html") +``` + + tmp.fig01.html + + + + + + +![png](ex0_0_files/tmp.fig01.png) + + + + +## generate a word cloud + + +```python +wordcloud = render.generate_wordcloud() +display(wordcloud.to_image()) +``` + + + +![png](ex0_0_files/ex0_0_37_0.png) + + + +## cluster communities in the lemma graph + +In the tutorial +"How to Convert Any Text Into a Graph of Concepts", +Rahul Nayak uses the +girvan-newman +algorithm to split the graph into communities, then clusters on those communities. +His approach works well for unsupervised clustering of key phrases which have been extracted from many documents. +In contrast, Nayak was working with entities extracted from "chunks" of text, not with a text graph. 
+ + +```python +render.draw_communities(); +``` + + + +![png](ex0_0_files/ex0_0_40_0.png) + + + +## graph of relations transform + +Show a transformed graph, based on _graph of relations_ (see: `lee2023ingram`) + + +```python +graph: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations( + tg +) + +graph.seeds() +graph.construct_gor() +``` + + +```python +scores: typing.Dict[ tuple, float ] = graph.get_affinity_scores() +pv_graph: pyvis.network.Network = graph.render_gor_pyvis(scores) + +pv_graph.force_atlas_2based( + gravity = -38, + central_gravity = 0.01, + spring_length = 231, + spring_strength = 0.7, + damping = 0.8, + overlap = 0, +) + +pv_graph.show_buttons(filter_ = [ "physics" ]) +pv_graph.toggle_physics(True) + +pv_graph.prep_notebook() +pv_graph.show("tmp.fig02.html") +``` + + tmp.fig02.html + + + + + + +![png](ex0_0_files/tmp.fig02.png) + + + + +*What does this transform provide?* + +By using a _graph of relations_ dual representation of our graph data, first and foremost we obtain a more compact representation of the relations in the graph, and a means of making inferences (e.g., _link prediction_) where there is substantially more invariance in the training data. + +Also recognize that for a parse graph of a paragraph in the English language, the most interesting nodes will probably be either subjects (`nsubj`) or objects of prepositions (`pobj`). Here in the _graph of relations_ we see illustrated how the important details from _entity linking_ tend to cluster near either `nsubj` or `pobj` entities, connected through punctuation. This is not as readily observed in the earlier visualization of the _lemma graph_. + +## extract as RDF triples + +Extract the nodes and edges which have IRIs, to create an "abstraction layer" as a semantic graph at a higher level of detail above the _lemma graph_: + + +```python +triples: str = tg.export_rdf() +print(triples) +``` + + @base . + @prefix dbo: . + @prefix dbr: . + @prefix schema: . + @prefix skos: . 
+ @prefix wd_ent: . + + dbr:Germany skos:definition "Germany (German: Deutschland, German pronunciation: [ˈdɔʏtʃlant]), constitutionally the Federal"@en ; + skos:prefLabel "Germany"@en . + + dbr:United_States skos:definition "The United States of America (USA), commonly known as the United States (U.S. or US) or America"@en ; + skos:prefLabel "United States"@en . + + dbr:Werner_Herzog skos:definition "Werner Herzog (German: [ˈvɛɐ̯nɐ ˈhɛɐ̯tsoːk]; born 5 September 1942) is a German film director"@en ; + skos:prefLabel "Werner Herzog"@en . + + wd_ent:Q183 skos:definition "country in Central Europe"@en ; + skos:prefLabel "Germany"@en . + + wd_ent:Q44131 skos:definition "German film director, producer, screenwriter, actor and opera director"@en ; + skos:prefLabel "Werner Herzog"@en . + + a dbo:Country ; + skos:prefLabel "America"@en ; + schema:event . + + a dbo:Person ; + skos:prefLabel "Dietrich Herzog"@en ; + schema:children . + + skos:prefLabel "filmmaker"@en . + + skos:prefLabel "intellectual"@en . + + skos:prefLabel "son"@en . + + a dbo:Person ; + skos:prefLabel "Werner"@en . + + a dbo:Country ; + skos:prefLabel "Germany"@en . + + skos:prefLabel "war"@en . + + a dbo:Person ; + skos:prefLabel "Werner Herzog"@en ; + schema:nationality . + + dbo:Country skos:definition "Countries, cities, states"@en ; + skos:prefLabel "country"@en . + + dbo:Person skos:definition "People, including fictional"@en ; + skos:prefLabel "person"@en . 
+ + + + +## statistical stack profile instrumentation + + +```python +profiler.stop() +``` + + + + + + + + + +```python +profiler.print() +``` + + + _ ._ __/__ _ _ _ _ _/_ Recorded: 17:41:51 Samples: 11163 + /_//_/// /_\ / //_// / //_'/ // Duration: 57.137 CPU time: 72.235 + / _/ v4.6.1 + + Program: /Users/paco/src/textgraphs/venv/lib/python3.10/site-packages/ipykernel_launcher.py -f /Users/paco/Library/Jupyter/runtime/kernel-8ffadb7d-3b45-4e0e-a94f-f098e5ad9fbe.json + + 57.136 _UnixSelectorEventLoop._run_once asyncio/base_events.py:1832 + └─ 57.135 Handle._run asyncio/events.py:78 + [12 frames hidden] asyncio, ipykernel, IPython + 41.912 ZMQInteractiveShell.run_ast_nodes IPython/core/interactiveshell.py:3394 + ├─ 20.701 ../ipykernel_5151/1245857438.py:1 + │ └─ 20.701 TextGraphs.perform_entity_linking textgraphs/doc.py:534 + │ └─ 20.701 KGWikiMedia.perform_entity_linking textgraphs/kg.py:306 + │ ├─ 10.790 KGWikiMedia._link_kg_search_entities textgraphs/kg.py:932 + │ │ └─ 10.787 KGWikiMedia.dbpedia_search_entity textgraphs/kg.py:641 + │ │ └─ 10.711 get requests/api.py:62 + │ │ [37 frames hidden] requests, urllib3, http, socket, ssl,... + │ ├─ 9.143 KGWikiMedia._link_spotlight_entities textgraphs/kg.py:851 + │ │ └─ 9.140 KGWikiMedia.dbpedia_search_entity textgraphs/kg.py:641 + │ │ └─ 9.095 get requests/api.py:62 + │ │ [37 frames hidden] requests, urllib3, http, socket, ssl,... 
+ │ └─ 0.768 KGWikiMedia._secondary_entity_linking textgraphs/kg.py:1060 + │ └─ 0.768 KGWikiMedia.wikidata_search textgraphs/kg.py:575 + │ └─ 0.765 KGWikiMedia._wikidata_endpoint textgraphs/kg.py:444 + │ └─ 0.765 get requests/api.py:62 + │ [7 frames hidden] requests, urllib3 + └─ 19.514 ../ipykernel_5151/1708547378.py:1 + ├─ 14.502 InferRel_Rebel.__init__ textgraphs/rel.py:121 + │ └─ 14.338 pipeline transformers/pipelines/__init__.py:531 + │ [39 frames hidden] transformers, torch, , json + ├─ 3.437 PipelineFactory.__init__ textgraphs/pipe.py:434 + │ └─ 3.420 load spacy/__init__.py:27 + │ [20 frames hidden] spacy, en_core_web_sm, catalogue, imp... + ├─ 0.900 InferRel_OpenNRE.__init__ textgraphs/rel.py:33 + │ └─ 0.888 get_model opennre/pretrain.py:126 + └─ 0.672 TextGraphs.create_pipeline textgraphs/doc.py:103 + └─ 0.672 PipelineFactory.create_pipeline textgraphs/pipe.py:508 + └─ 0.672 Pipeline.__init__ textgraphs/pipe.py:216 + └─ 0.672 English.__call__ spacy/language.py:1016 + [11 frames hidden] spacy, spacy_dbpedia_spotlight, reque... 
+ 14.363 InferRel_Rebel.gen_triples_async textgraphs/pipe.py:188 + ├─ 13.670 InferRel_Rebel.gen_triples textgraphs/rel.py:259 + │ ├─ 12.439 InferRel_Rebel.tokenize_sent textgraphs/rel.py:145 + │ │ └─ 12.436 TranslationPipeline.__call__ transformers/pipelines/text2text_generation.py:341 + │ │ [42 frames hidden] transformers, torch, + │ └─ 1.231 KGWikiMedia.resolve_rel_iri textgraphs/kg.py:370 + │ └─ 0.753 get_entity_dict_from_api qwikidata/linked_data_interface.py:21 + │ [8 frames hidden] qwikidata, requests, urllib3 + └─ 0.693 InferRel_OpenNRE.gen_triples textgraphs/rel.py:58 + + + + +## outro + +_\[ more parts are in progress, getting added to this demo \]_ diff --git a/docs/ex0_0_files/ex0_0_17_0.svg b/docs/ex0_0_files/ex0_0_17_0.svg new file mode 100644 index 0000000000000000000000000000000000000000..bb7a5e2f0e0d57f63ace93f83d218576ea60644e --- /dev/null +++ b/docs/ex0_0_files/ex0_0_17_0.svg @@ -0,0 +1,324 @@ + + + Werner Herzog + PROPN + + + + is + AUX + + + + a + DET + + + + remarkable + ADJ + + + + filmmaker + NOUN + + + + and + CCONJ + + + + an + DET + + + + intellectual + NOUN + + + + originally + ADV + + + + from + ADP + + + + Germany, + PROPN + + + + the + DET + + + + son + NOUN + + + + of + ADP + + + + Dietrich Herzog. + PUNCT + + + + + + SPACE + + + + After + ADP + + + + the + DET + + + + war, + NOUN + + + + Werner + PROPN + + + + fled + VERB + + + + to + ADP + + + + America + PROPN + + + + to + PART + + + + become + VERB + + + + famous. 
+ ADJ + + + + + + nsubj + + + + + + + + det + + + + + + + + amod + + + + + + + + attr + + + + + + + + cc + + + + + + + + det + + + + + + + + conj + + + + + + + + advmod + + + + + + + + prep + + + + + + + + pobj + + + + + + + + det + + + + + + + + appos + + + + + + + + prep + + + + + + + + punct + + + + + + + + dep + + + + + + + + prep + + + + + + + + det + + + + + + + + pobj + + + + + + + + nsubj + + + + + + + + prep + + + + + + + + pobj + + + + + + + + aux + + + + + + + + advcl + + + + + + + + acomp + + + + \ No newline at end of file diff --git a/docs/ex0_0_files/ex0_0_37_0.jpg b/docs/ex0_0_files/ex0_0_37_0.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b635cc43be11ce16925b1df75f65777206e75cf5 Binary files /dev/null and b/docs/ex0_0_files/ex0_0_37_0.jpg differ diff --git a/docs/ex0_0_files/ex0_0_37_0.png b/docs/ex0_0_files/ex0_0_37_0.png new file mode 100644 index 0000000000000000000000000000000000000000..d0e38cbd38a5bf00cdfb224481d8deb75269f945 Binary files /dev/null and b/docs/ex0_0_files/ex0_0_37_0.png differ diff --git a/docs/ex0_0_files/ex0_0_39_0.jpg b/docs/ex0_0_files/ex0_0_39_0.jpg new file mode 100644 index 0000000000000000000000000000000000000000..06f8e7ad3e4f5d76fadd952120efcd21ef2b1de8 Binary files /dev/null and b/docs/ex0_0_files/ex0_0_39_0.jpg differ diff --git a/docs/ex0_0_files/ex0_0_39_0.png b/docs/ex0_0_files/ex0_0_39_0.png new file mode 100644 index 0000000000000000000000000000000000000000..79e6f18166f8ffa4be89492c69059339d0b187bd Binary files /dev/null and b/docs/ex0_0_files/ex0_0_39_0.png differ diff --git a/docs/ex0_0_files/ex0_0_40_0.png b/docs/ex0_0_files/ex0_0_40_0.png new file mode 100644 index 0000000000000000000000000000000000000000..60937b66f2a1f8d875d9c699a9e1cb9fcc0576f4 Binary files /dev/null and b/docs/ex0_0_files/ex0_0_40_0.png differ diff --git a/docs/ex0_0_files/ex0_0_42_0.png b/docs/ex0_0_files/ex0_0_42_0.png new file mode 100644 index 
0000000000000000000000000000000000000000..7f77eac5b002bf632b1f749d2b22d438350ef40c Binary files /dev/null and b/docs/ex0_0_files/ex0_0_42_0.png differ diff --git a/docs/ex0_0_files/tmp.fig01.png b/docs/ex0_0_files/tmp.fig01.png new file mode 100644 index 0000000000000000000000000000000000000000..b6ad119b26f2ac556da37f1e59af628fa527cfd4 Binary files /dev/null and b/docs/ex0_0_files/tmp.fig01.png differ diff --git a/docs/ex0_0_files/tmp.fig02.png b/docs/ex0_0_files/tmp.fig02.png new file mode 100644 index 0000000000000000000000000000000000000000..eea8c4a4f81eb9f9489e1df8f47da02cbe228674 Binary files /dev/null and b/docs/ex0_0_files/tmp.fig02.png differ diff --git a/docs/ex1_0.md b/docs/ex1_0.md new file mode 100644 index 0000000000000000000000000000000000000000..264a8b2b5422cde6e02ad46958b4300549d8cb56 --- /dev/null +++ b/docs/ex1_0.md @@ -0,0 +1,776 @@ + + +!!! note + To run this notebook in JupyterLab, load [`examples/ex1_0.ipynb`](https://github.com/DerwenAI/textgraphs/blob/main/examples/ex1_0.ipynb) + + + +# reproduce results from the "InGram" paper + +This is an attempt to reproduce the _graph of relations_ example given in `lee2023ingram` + +## environment + + +```python +import os +import pathlib +import typing + +from icecream import ic +from pyinstrument import Profiler +import matplotlib.pyplot as plt +import pandas as pd +import pyvis + +import textgraphs +``` + + +```python +%load_ext watermark +``` + + +```python +%watermark +``` + + Last updated: 2024-01-16T17:35:45.550539-08:00 + + Python implementation: CPython + Python version : 3.10.11 + IPython version : 8.20.0 + + Compiler : Clang 13.0.0 (clang-1300.0.29.30) + OS : Darwin + Release : 21.6.0 + Machine : x86_64 + Processor : i386 + CPU cores : 8 + Architecture: 64bit + + + + +```python +%watermark --iversions +``` + + matplotlib: 3.8.2 + pandas : 2.1.4 + pyvis : 0.3.2 + textgraphs: 0.5.0 + sys : 3.10.11 (v3.10.11:7d4cc5aa85, Apr 4 2023, 19:05:19) [Clang 13.0.0 (clang-1300.0.29.30)] + + + +## load 
example graph + +load from a JSON file which replicates the data for the "Figure 3" example + + +```python +graph: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations( + textgraphs.SimpleGraph() +) + +ingram_path: pathlib.Path = pathlib.Path(os.getcwd()) / "ingram.json" + +graph.load_ingram( + ingram_path, + debug = False, +) +``` + +set up the statistical stack profiling + + +```python +profiler: Profiler = Profiler() +profiler.start() +``` + +## decouple graph edges into "seeds" + + +```python +graph.seeds( + debug = True, +) +``` + + + --- triples in source graph --- + + + ic| edge.src_node: 0, rel_id: 1, edge.dst_node: 1 + ic| edge.src_node: 0, rel_id: 0, edge.dst_node: 2 + ic| edge.src_node: 0, rel_id: 0, edge.dst_node: 3 + ic| edge.src_node: 4, rel_id: 2, edge.dst_node: 2 + ic| edge.src_node: 4, rel_id: 2, edge.dst_node: 3 + ic| edge.src_node: 4, rel_id: 1, edge.dst_node: 5 + ic| edge.src_node: 6, rel_id: 1, edge.dst_node: 5 + ic| edge.src_node: 6, rel_id: 2, edge.dst_node: 7 + ic| edge.src_node: 6, rel_id: 4, edge.dst_node: 8 + ic| edge.src_node: 9, + + Steven_Spielberg Profession Director + Steven_Spielberg Directed Catch_Me_If_Can + Steven_Spielberg Directed Saving_Private_Ryan + Tom_Hanks ActedIn Catch_Me_If_Can + Tom_Hanks ActedIn Saving_Private_Ryan + Tom_Hanks Profession Actor + Mark_Hamil Profession Actor + Mark_Hamil ActedIn Star_Wars + Mark_Hamil BornIn California + + + rel_id: 5, edge.dst_node: 10 + ic| edge.src_node: 9, rel_id: 4, edge.dst_node: 10 + ic| edge.src_node: 9, rel_id: 3, edge.dst_node: 8 + ic| edge.src_node: 11, rel_id: 4, edge.dst_node: 12 + ic| edge.src_node: 11, rel_id: 3, edge.dst_node: 12 + ic| edge.src_node: 11, rel_id: 3, edge.dst_node: 8 + + + Brad_Pitt Nationality USA + Brad_Pitt BornIn USA + Brad_Pitt LivedIn California + Clint_Eastwood BornIn San_Francisco + Clint_Eastwood LivedIn San_Francisco + Clint_Eastwood LivedIn California + + + +```python +graph.trace_source_graph() +``` + + + --- nodes in source graph --- + n: 
0, Steven_Spielberg + head: [] + tail: [(0, 'Profession', 1), (0, 'Directed', 2), (0, 'Directed', 3)] + n: 1, Director + head: [(0, 'Profession', 1)] + tail: [] + n: 2, Catch_Me_If_Can + head: [(0, 'Directed', 2), (4, 'ActedIn', 2)] + tail: [] + n: 3, Saving_Private_Ryan + head: [(0, 'Directed', 3), (4, 'ActedIn', 3)] + tail: [] + n: 4, Tom_Hanks + head: [] + tail: [(4, 'ActedIn', 2), (4, 'ActedIn', 3), (4, 'Profession', 5)] + n: 5, Actor + head: [(4, 'Profession', 5), (6, 'Profession', 5)] + tail: [] + n: 6, Mark_Hamil + head: [] + tail: [(6, 'Profession', 5), (6, 'ActedIn', 7), (6, 'BornIn', 8)] + n: 7, Star_Wars + head: [(6, 'ActedIn', 7)] + tail: [] + n: 8, California + head: [(6, 'BornIn', 8), (9, 'LivedIn', 8), (11, 'LivedIn', 8)] + tail: [] + n: 9, Brad_Pitt + head: [] + tail: [(9, 'Nationality', 10), (9, 'BornIn', 10), (9, 'LivedIn', 8)] + n: 10, USA + head: [(9, 'Nationality', 10), (9, 'BornIn', 10)] + tail: [] + n: 11, Clint_Eastwood + head: [] + tail: [(11, 'BornIn', 12), (11, 'LivedIn', 12), (11, 'LivedIn', 8)] + n: 12, San_Francisco + head: [(11, 'BornIn', 12), (11, 'LivedIn', 12)] + tail: [] + + --- edges in source graph --- + e: 0, Directed + e: 1, Profession + e: 2, ActedIn + e: 3, LivedIn + e: 4, BornIn + e: 5, Nationality + + +## construct a _graph of relations_ + +Transform the graph data into _graph of relations_ + + +```python +graph.construct_gor( + debug = True, +) +``` + + ic| node_id: 0, len(seeds + + + --- transformed triples --- + + + ): 3 + ic| trans_arc: TransArc(pair_key=(0, 1), + a_rel=1, + b_rel=0, + node_id=0, + a_dir=, + b_dir=) + ic| trans_arc: TransArc(pair_key=(0, 1), + a_rel=1, + b_rel=0, + node_id=0, + a_dir=, + b_dir=) + ic| trans_arc: TransArc(pair_key=(0, 0), + a_rel=0, + b_rel=0, + node_id=0, + a_dir=, + b_dir=) + ic| node_id: 1, len(seeds + + + + + ): 1 + ic| node_id: 2, len(seeds): 2 + ic| trans_arc: TransArc(pair_key=(0, 2), + a_rel=0, + b_rel=2, + node_id=2, + a_dir=, + b_dir=< + + (0, 2) Directed.head Catch_Me_If_Can 
ActedIn.head + + + RelDir.HEAD: 0>) + ic| node_id: 3, len(seeds): 2 + ic| trans_arc: TransArc(pair_key=(0, 2), + a_rel=0, + b_rel=2, + node_id=3, + a_dir=, + b_dir=) + ic| node_id + + + (0, 2) Directed.head Saving_Private_Ryan ActedIn.head + + + + : 4, len(seeds): 3 + ic| trans_arc: TransArc(pair_key=(2, 2), + a_rel=2, + b_rel=2, + node_id=4, + a_dir=, + b_dir=) + ic| trans_arc: TransArc(pair_key=(1, 2), + a_rel=2, + b_rel=1, + node_id=4, + a_dir=, + b_dir=) + ic| trans_arc: TransArc(pair_key=(1, 2) + + (2, 2) ActedIn.tail Tom_Hanks ActedIn.tail + + (1, 2) ActedIn.tail Tom_Hanks Profession.tail + + (1, 2) ActedIn.tail Tom_Hanks Profession.tail + + + , + a_rel=2, + b_rel=1, + node_id=4, + a_dir=, + b_dir=) + ic| + + + + + node_id: 5, len(seeds): 2 + ic| trans_arc: TransArc(pair_key=(1, 1), + a_rel=1, + b_rel=1, + + + (1, 1) Profession.head Actor Profession.head + + + node_id=5, + a_dir=, + b_dir=) + ic| node_id: 6, len(seeds): 3 + ic| trans_arc: TransArc(pair_key=(1, 2), + a_rel=1, + b_rel=2, + node_id=6, + a_dir=, + b_dir=) + ic| trans_arc: TransArc(pair_key=(1, 4), + a_rel=1, + b_rel=4, + node_id=6, + a_dir + + + (1, 4) Profession.tail Mark_Hamil BornIn.tail + + + =, + b_dir=) + ic| trans_arc: TransArc(pair_key=(2, 4), + a_rel=2, + b_rel=4, + node_id=6, + + + + (2, 4) ActedIn.tail Mark_Hamil BornIn.tail + + + a_dir=, + b_dir=) + ic| node_id: 7, len(seeds): 1 + ic| node_id: 8, len(seeds): 3 + ic| trans_arc: TransArc(pair_key=(3, 4), + a_rel=4, + b_rel=3, + node_id=8, + a_dir=, + b_dir=) + ic| trans_arc: TransArc(pair_key=(3, 4), + a_rel=4, + b_rel=3, + node_id=8, + a_dir=, + b_dir=) + ic| trans_arc: TransArc(pair_key=(3, 3), + a_rel=3, + b_rel=3, + node_id=8, + a_dir=, + b_dir=) + ic| node_id: 9, len(seeds): 3 + ic + + + (3, 4) BornIn.head California LivedIn.head + + (3, 3) LivedIn.head California LivedIn.head + + (4, 5) Nationality.tail Brad_Pitt BornIn.tail + + + | trans_arc: TransArc(pair_key=(4, 5), + a_rel=5, + b_rel=4, + node_id=9, + a_dir=, + b_dir=) + ic| 
trans_arc: TransArc(pair_key=(3, 5), + a_rel=5, + b_rel=3, + node_id=9, + a_dir=, + b_dir=< + + + (3, 5) Nationality.tail Brad_Pitt LivedIn.tail + + + RelDir.TAIL: 1>) + ic| trans_arc: TransArc(pair_key=(3, 4), + a_rel=4, + b_rel=3, + node_id=9, + a_dir=, + b_dir=) + ic| node_id: 10, len(seeds): 2 + ic| trans_arc: TransArc(pair_key=(4, 5), + a_rel=5, + b_rel=4, + node_id=10, + a_dir=, + b_dir=) + ic| node_id: 11, len(seeds): 3 + ic| trans_arc: TransArc(pair_key=(3, + + + (3, 4) BornIn.tail Brad_Pitt LivedIn.tail + + (4, 5) Nationality.head USA BornIn.head + + (3, 4) BornIn.tail Clint_Eastwood LivedIn.tail + + + 4), + a_rel=4, + b_rel=3, + node_id=11, + a_dir=, + b_dir=) + ic + + + (3, 4) BornIn.tail Clint_Eastwood LivedIn.tail + + + | trans_arc: TransArc(pair_key=(3, 4), + a_rel=4, + b_rel=3, + node_id=11, + a_dir=, + b_dir=) + ic| trans_arc: TransArc(pair_key=(3, 3), + a_rel=3, + b_rel=3, + node_id=11, + a_dir=, + b_dir=) + ic| node_id: 12, len(seeds + + + (3, 3) LivedIn.tail Clint_Eastwood LivedIn.tail + + + + ): 2 + ic| trans_arc: TransArc(pair_key=(3, 4), + a_rel=4, + b_rel=3, + node_id=12, + a_dir=, + b_dir=) + + + (3, 4) BornIn.head San_Francisco LivedIn.head + + + + +```python +scores: typing.Dict[ tuple, float ] = graph.get_affinity_scores( + debug = True, +) +``` + + + --- collect shared entity tallies --- + 0 Directed + h: 4 dict_items([(2, 4.0)]) + t: 6 dict_items([(0, 3.0), (1, 3.0)]) + 1 Profession + h: 3 dict_items([(1, 3.0)]) + t: 10 dict_items([(0, 3.0), (2, 5.0), (4, 2.0)]) + 2 ActedIn + h: 4 dict_items([(0, 4.0)]) + t: 10 dict_items([(1, 5.0), (2, 3.0), (4, 2.0)]) + 3 LivedIn + h: 8 dict_items([(3, 3.0), (4, 5.0)]) + t: 10 dict_items([(3, 3.0), (4, 5.0), (5, 2.0)]) + 4 BornIn + h: 7 dict_items([(3, 5.0), (5, 2.0)]) + t: 11 dict_items([(1, 2.0), (2, 2.0), (3, 5.0), (5, 2.0)]) + 5 Nationality + h: 2 dict_items([(4, 2.0)]) + t: 4 dict_items([(3, 2.0), (4, 2.0)]) + + + +```python +ic(scores); +``` + + ic| scores: {(0, 0): 0.3, + (0, 1): 
0.2653846153846154, + (0, 2): 0.34285714285714286, + (1, 1): 0.23076923076923078, + (1, 2): 0.3708791208791209, + (1, 4): 0.13247863247863248, + (2, 2): 0.21428571428571427, + (2, 4): 0.12698412698412698, + (3, 3): 0.3333333333333333, + (3, 4): 0.5555555555555556, + (3, 5): 0.2222222222222222, + (4, 5): 0.4444444444444444} + + +## visualize the transform results + + +```python +graph.render_gor_plt(scores) +plt.show() +``` + + + +![png](ex1_0_files/ex1_0_22_0.png) + + + + +```python +pv_graph: pyvis.network.Network = graph.render_gor_pyvis(scores) + +pv_graph.force_atlas_2based( + gravity = -38, + central_gravity = 0.01, + spring_length = 231, + spring_strength = 0.7, + damping = 0.8, + overlap = 0, +) + +pv_graph.show_buttons(filter_ = [ "physics" ]) +pv_graph.toggle_physics(True) + +pv_graph.prep_notebook() +pv_graph.show("tmp.fig03.html") +``` + + tmp.fig03.html + + + + + + +![png](ex1_0_files/tmp.fig03.png) + + + + +## analysis + +As the results below illustrate, the computed _affinity scores_ differ from what is published in `lee2023ingram`. After trying several different variations of interpretation for the paper's descriptions, the current approach provides the closest approximation that we have obtained. + + +```python +df: pd.DataFrame = graph.trace_metrics(scores) +df +``` + + + + +<div>
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
pairrel_arel_baffinityexpected
0(0, 0)DirectedDirected0.30NaN
1(0, 1)DirectedProfession0.270.22
2(0, 2)DirectedActedIn0.340.50
3(1, 1)ProfessionProfession0.23NaN
4(1, 2)ProfessionActedIn0.370.33
5(1, 4)ProfessionBornIn0.130.11
6(2, 2)ActedInActedIn0.21NaN
7(2, 4)ActedInBornIn0.130.11
8(3, 3)LivedInLivedIn0.33NaN
9(3, 4)LivedInBornIn0.560.81
10(3, 5)LivedInNationality0.220.11
11(4, 5)BornInNationality0.440.36
+
+ + + +## statistical stack profile instrumentation + + +```python +profiler.stop() +``` + + + + + + + + + +```python +profiler.print() +``` + + + _ ._ __/__ _ _ _ _ _/_ Recorded: 17:35:45 Samples: 2526 + /_//_/// /_\ / //_// / //_'/ // Duration: 3.799 CPU time: 4.060 + / _/ v4.6.1 + + Program: /Users/paco/src/textgraphs/venv/lib/python3.10/site-packages/ipykernel_launcher.py -f /Users/paco/Library/Jupyter/runtime/kernel-27f0c564-73f8-45ab-9f64-8b064ae1de10.json + + 3.799 IPythonKernel.dispatch_queue ipykernel/kernelbase.py:525 + └─ 3.791 IPythonKernel.process_one ipykernel/kernelbase.py:511 + [10 frames hidden] ipykernel, IPython + 3.680 ZMQInteractiveShell.run_ast_nodes IPython/core/interactiveshell.py:3394 + ├─ 2.176 ../ipykernel_4421/3358887201.py:1 + │ └─ 2.176 GraphOfRelations.construct_gor textgraphs/gor.py:311 + │ ├─ 1.607 IceCreamDebugger.__call__ icecream/icecream.py:204 + │ │ [17 frames hidden] icecream, colorama, ipykernel, thread... + │ │ 1.078 lock.acquire + │ └─ 0.566 GraphOfRelations._transformed_triples textgraphs/gor.py:275 + │ └─ 0.563 IceCreamDebugger.__call__ icecream/icecream.py:204 + │ [13 frames hidden] icecream, colorama, ipykernel, zmq, t... + ├─ 0.866 ../ipykernel_4421/4061275008.py:1 + │ └─ 0.866 GraphOfRelations.seeds textgraphs/gor.py:197 + │ └─ 0.865 IceCreamDebugger.__call__ icecream/icecream.py:204 + │ [42 frames hidden] icecream, inspect, posixpath, ../ipykernel_4421/559531165.py:1 + │ ├─ 0.234 show matplotlib/pyplot.py:482 + │ │ [32 frames hidden] matplotlib, matplotlib_inline, IPytho... + │ └─ 0.128 GraphOfRelations.render_gor_plt textgraphs/gor.py:522 + │ └─ 0.104 draw_networkx networkx/drawing/nx_pylab.py:127 + │ [6 frames hidden] networkx, matplotlib + ├─ 0.197 ../ipykernel_4421/1169542473.py:1 + │ └─ 0.197 IceCreamDebugger.__call__ icecream/icecream.py:204 + │ [14 frames hidden] icecream, colorama, ipykernel, thread... 
+ └─ 0.041 ../ipykernel_4421/2247466716.py:1 + + + + +## outro + +_\[ more parts are in progress, getting added to this demo \]_ diff --git a/docs/ex1_0_files/ex1_0_22_0.png b/docs/ex1_0_files/ex1_0_22_0.png new file mode 100644 index 0000000000000000000000000000000000000000..1b967566da2d7f2b99602a8778dc8cf6c7a5a4e4 Binary files /dev/null and b/docs/ex1_0_files/ex1_0_22_0.png differ diff --git a/docs/ex1_0_files/tmp.fig01.png b/docs/ex1_0_files/tmp.fig01.png new file mode 100644 index 0000000000000000000000000000000000000000..fe2f7e8f265ef433420f25e581b47e5a573127c9 Binary files /dev/null and b/docs/ex1_0_files/tmp.fig01.png differ diff --git a/docs/ex1_0_files/tmp.fig03.png b/docs/ex1_0_files/tmp.fig03.png new file mode 100644 index 0000000000000000000000000000000000000000..0c6efe1e0a52816640fde9dc509fd96a0c29fea4 Binary files /dev/null and b/docs/ex1_0_files/tmp.fig03.png differ diff --git a/docs/ex2_0.md b/docs/ex2_0.md new file mode 100644 index 0000000000000000000000000000000000000000..874b505b3e6cef12aefd664d52cac1d05eaf96ff --- /dev/null +++ b/docs/ex2_0.md @@ -0,0 +1,249 @@ + + +!!! note + To run this notebook in JupyterLab, load [`examples/ex2_0.ipynb`](https://github.com/DerwenAI/textgraphs/blob/main/examples/ex2_0.ipynb) + + + +# bootstrap the _lemma graph_ with RDF triples + +Show how to bootstrap definitions in a _lemma graph_ by loading RDF, e.g., for synonyms. 
+ +## environment + + +```python +from icecream import ic +from pyinstrument import Profiler +import pyvis + +import textgraphs +``` + + +```python +%load_ext watermark +``` + + +```python +%watermark +``` + + Last updated: 2024-01-16T17:35:59.608787-08:00 + + Python implementation: CPython + Python version : 3.10.11 + IPython version : 8.20.0 + + Compiler : Clang 13.0.0 (clang-1300.0.29.30) + OS : Darwin + Release : 21.6.0 + Machine : x86_64 + Processor : i386 + CPU cores : 8 + Architecture: 64bit + + + + +```python +%watermark --iversions +``` + + pyvis : 0.3.2 + textgraphs: 0.5.0 + sys : 3.10.11 (v3.10.11:7d4cc5aa85, Apr 4 2023, 19:05:19) [Clang 13.0.0 (clang-1300.0.29.30)] + + + +## load the bootstrap definitions + +Define the bootstrap RDF triples in N3/Turtle format: we define an entity `Werner` as a synonym for `Werner Herzog` by using the [`skos:broader`](https://www.w3.org/TR/skos-reference/#semantic-relations) relation. Keep in mind that this entity may also refer to other Werners... + + +```python +TTL_STR: str = """ +@base . +@prefix dbo: . +@prefix skos: . + + a dbo:Person ; + skos:prefLabel "Werner"@en . + + a dbo:Person ; + skos:prefLabel "Werner Herzog"@en. + +dbo:Person skos:definition "People, including fictional"@en ; + skos:prefLabel "person"@en . + + skos:broader . +""" +``` + +Provide the source text + + +```python +SRC_TEXT: str = """ +Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog. +After the war, Werner fled to America to become famous. 
+""" +``` + +set up the statistical stack profiling + + +```python +profiler: Profiler = Profiler() +profiler.start() +``` + +set up the `TextGraphs` pipeline + + +```python +tg: textgraphs.TextGraphs = textgraphs.TextGraphs( + factory = textgraphs.PipelineFactory( + kg = textgraphs.KGWikiMedia( + spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API, + dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API, + dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API, + wikidata_api = textgraphs.WIKIDATA_API, + min_alias = textgraphs.DBPEDIA_MIN_ALIAS, + min_similarity = textgraphs.DBPEDIA_MIN_SIM, + ), + ), +) +``` + +load the bootstrap definitions + + +```python +tg.load_bootstrap_ttl( + TTL_STR, + debug = False, +) +``` + +parse the input text + + +```python +pipe: textgraphs.Pipeline = tg.create_pipeline( + SRC_TEXT.strip(), +) + +tg.collect_graph_elements( + pipe, + debug = False, +) + +tg.construct_lemma_graph( + debug = False, +) +``` + +## visualize the lemma graph + + +```python +render: textgraphs.RenderPyVis = tg.create_render() + +pv_graph: pyvis.network.Network = render.render_lemma_graph( + debug = False, +) +``` + +initialize the layout parameters + + +```python +pv_graph.force_atlas_2based( + gravity = -38, + central_gravity = 0.01, + spring_length = 231, + spring_strength = 0.7, + damping = 0.8, + overlap = 0, +) + +pv_graph.show_buttons(filter_ = [ "physics" ]) +pv_graph.toggle_physics(True) +``` + + +```python +pv_graph.prep_notebook() +pv_graph.show("tmp.fig04.html") +``` + + tmp.fig04.html + + + + + + +![png](ex2_0_files/tmp.fig04.png) + + + + +Notice how the `Werner` and `Werner Herzog` nodes are now linked? This synonym from the bootstrap definitions above provided means to link more portions of the _lemma graph_ than the demo in `ex0_0` with the same input text. 
+ +## statistical stack profile instrumentation + + +```python +profiler.stop() +``` + + + + + + + + + +```python +profiler.print() +``` + + + _ ._ __/__ _ _ _ _ _/_ Recorded: 17:35:59 Samples: 2846 + /_//_/// /_\ / //_// / //_'/ // Duration: 4.111 CPU time: 3.294 + / _/ v4.6.1 + + Program: /Users/paco/src/textgraphs/venv/lib/python3.10/site-packages/ipykernel_launcher.py -f /Users/paco/Library/Jupyter/runtime/kernel-4365d4ba-2d4d-4d4b-83e2-eb5ef8abfe26.json + + 4.111 IPythonKernel.dispatch_shell ipykernel/kernelbase.py:378 + └─ 4.075 IPythonKernel.execute_request ipykernel/kernelbase.py:721 + [9 frames hidden] ipykernel, IPython + 3.995 ZMQInteractiveShell.run_ast_nodes IPython/core/interactiveshell.py:3394 + ├─ 3.250 ../ipykernel_4433/1372904243.py:1 + │ └─ 3.248 PipelineFactory.__init__ textgraphs/pipe.py:434 + │ └─ 3.232 load spacy/__init__.py:27 + │ [98 frames hidden] spacy, en_core_web_sm, catalogue, imp... + │ 0.496 tokenizer_factory spacy/language.py:110 + │ └─ 0.108 _validate_special_case spacy/tokenizer.pyx:573 + │ 0.439 spacy/language.py:2170 + │ └─ 0.085 _validate_special_case spacy/tokenizer.pyx:573 + ├─ 0.672 ../ipykernel_4433/3257668275.py:1 + │ └─ 0.669 TextGraphs.create_pipeline textgraphs/doc.py:103 + │ └─ 0.669 PipelineFactory.create_pipeline textgraphs/pipe.py:508 + │ └─ 0.669 Pipeline.__init__ textgraphs/pipe.py:216 + │ └─ 0.669 English.__call__ spacy/language.py:1016 + │ [31 frames hidden] spacy, spacy_dbpedia_spotlight, reque... 
+ └─ 0.055 ../ipykernel_4433/72966960.py:1 + └─ 0.046 Network.prep_notebook pyvis/network.py:552 + [5 frames hidden] pyvis, jinja2 + + + + +## outro + +_\[ more parts are in progress, getting added to this demo \]_ diff --git a/docs/ex2_0_files/tmp.fig01.png b/docs/ex2_0_files/tmp.fig01.png new file mode 100644 index 0000000000000000000000000000000000000000..b2051fb64f6060ac21f9c107ee331787f5303d6c Binary files /dev/null and b/docs/ex2_0_files/tmp.fig01.png differ diff --git a/docs/ex2_0_files/tmp.fig04.png b/docs/ex2_0_files/tmp.fig04.png new file mode 100644 index 0000000000000000000000000000000000000000..e5fb2a1e160a6aa6b17bf0b4af77ddb0e495956a Binary files /dev/null and b/docs/ex2_0_files/tmp.fig04.png differ diff --git a/docs/glod.md b/docs/glod.md new file mode 100644 index 0000000000000000000000000000000000000000..c7b539b936de88bf7acbbbfb14f1ce8e343b8bf8 --- /dev/null +++ b/docs/glod.md @@ -0,0 +1,6 @@ +**TODO**: summarize from + +Overall, this approach relies on a notion of developing "abstraction layers" atop graph data: how can graphs be analyzed at differing levels of detail? + +For this one must understand patterns in graphs such as network motifs and how to run topological transforms to help identify patterns. +A side benefit is that such transforms can help boost the invariance of graph representations used when training models.
diff --git a/docs/glossary.md b/docs/glossary.md new file mode 100644 index 0000000000000000000000000000000000000000..568afd28348df803ede3a0816f93d536adc3aecc --- /dev/null +++ b/docs/glossary.md @@ -0,0 +1,25 @@ +books by b a r z i n from the Noun Project + +**DRAFT** + +- controlled vocabulary +- entity extraction +- entity linking +- generative AI +- graph levels of detail (GLOD) +- human-in-the-loop (HITL) +- internationalized resource identifier (IRI) +- knowledge graph construction +- knowledge graph (KG) +- labeled property graph (LPG) +- large language models (LLM) +- named entity recognition (NER) +- natural language processing (NLP) +- network motifs +- prompt engineering +- relation extraction (RE) +- retrieval augmented generation (RAG) +- semantic random walk +- statistical relational learning (SRL) +- topological decomposition of graphs +- topological transforms diff --git a/docs/graph.md b/docs/graph.md new file mode 100644 index 0000000000000000000000000000000000000000..6f656a1e9bd1124290ddb0e395df0c7805c7b2cf --- /dev/null +++ b/docs/graph.md @@ -0,0 +1,28 @@ +While many papers proceed from a graph-theoretic definition `G = (V, E)` these typically fail to take into account two important aspects of graph technologies in industry practice: + + 1. _labels_ and _properties_ (key/value attribute pairs) for more effective modeling of linked data + 2. _internationalized resource identifiers_ (IRIs) as unique identifiers that map into controlled vocabularies, which can be leveraged for graph queries and semantic inference + +Industry analysts sometimes point to these two concerns being represented by competing approaches, namely +_labeled property graphs_ (LPG) representation versus +_semantic web standards_ defined by the World Wide Web Consortium (W3C). +Efforts are in progress to harmonize both of these needs within the same graphs, such as [#hartig14](biblio.md#hartig14) for eventual standards.
+However, with some discipline in data modeling practices, both of these criteria can be met within current graph frameworks, provided that: + + * nodes and edges each have specific labels which serve as IRIs that map to a set of controlled vocabularies + * nodes and edges each have properties, which include probabilities from the point of generation + +Building on definitions given in [#martonsv17](biblio.md#martonsv17), [#qin2023sgr](biblio.md#qin2023sgr), this project proceeds from the perspective of primarily using LPG graph representation, while adhering to the aforementioned data modeling discipline. + +`G = (V, E, src, tgt, lbl, P)` is an edge-labeled directed multigraph with: + + - a set of nodes V + - a set of edges E + - function `src: E → V` that associates each edge with its source vertex + - function `tgt: E → V` that associates each edge with its target vertex + - function `lbl: E → dom(S)` that associates each edge with its label + - function `P: (V ∪ E) → 2^p` that associates nodes and edges with their properties + +The project architecture enables a "map-reduce" style of distributed processing, so that "chunks" of text (e.g., paragraphs) can be processed independently, with results being aggregated at the end of a batch. +The intermediate processing of each "chunk" uses `NetworkX` [#hagberg2008](biblio.md#hagberg2008) to allow for running in-memory graph algorithms and analytics, and integrate more efficiently with graph machine learning libraries. +Then an `openCypher` representation [#martonsv17](biblio.md#martonsv17) is used to serialize end results, which get aggregated using the open source `KùzuDB` graph database [#feng2023kuzu](biblio.md#feng2023kuzu) and its Python API.
diff --git a/docs/hitl.md b/docs/hitl.md new file mode 100644 index 0000000000000000000000000000000000000000..2f44b66eff58e513b521c93f97bfb899de63bf04 --- /dev/null +++ b/docs/hitl.md @@ -0,0 +1,21 @@ +Rather than fully automatic KG construction, this approach emphasizes means of incorporating _domain experts_ through "human-in-the-loop" (HITL) techniques. + +Multiple techniques can be employed to construct gradients for both the generated nodes and edges, starting with the quantitative scores from model inference. + + - gradient for recommending extracted entities: _named entity recognition_, _textrank_, _probabilistic soft logic_, etc. + - gradient for recommending extracted relations: _relation extraction_, _graph of relations_, etc. + +Results extracted from _lemma graphs_ provide gradients which can be leveraged to elicit feedback from domain experts: + + - high-pass filter: accept results as valid automated inference + - low-pass filter: reject results as errors and noise + +For the results which fall in-between, a recsys or similar UI can elicit review from domain experts, based on _active learning_, _weak supervision_, etc. see + +Subsequent to the HITL validation, the more valuable results collected within a _lemma graph_ can be extracted as the primary output from this approach. + +Based on a process of iterating through a text document in chunks, the results from one iteration can be used to bootstrap the _lemma graph_ for the next iteration. This provides a natural means of accumulating (i.e., aggregating) results from the overall analysis. + +By extension, this bootstrap/accumulation process can be used in the distributed processing of a corpus of documents, where the "data exhaust" of abstracted _lemma graphs_ used to bootstrap analysis workflows effectively becomes a _knowledge graph_, as a side-effect of the analysis.
+ + diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000000000000000000000000000000000000..9845b34871c409abe6112d82dbe420d2ba101939 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,50 @@ +# TextGraphs: raw texts, LLMs, and KGs, oh my! + +illustration of a lemma graph + +Welcome to the **TextGraphs** library... + + - demo: + - code: + - biblio: + - DOI: 10.5281/zenodo.10431783 + + +## Overview + +_Explore uses of large language models (LLMs) in semi-automated knowledge graph (KG) construction from unstructured text sources, with human-in-the-loop (HITL) affordances to incorporate guidance from domain experts._ + +What is "generative AI" in the context of working with knowledge graphs? +Initial attempts tend to fit a simple pattern based on _prompt engineering_: present text sources to a LLM-based chat interface, asking to generate an entire graph. +This is generally expensive and results are often poor. +Moreover, the lack of controls or curation in this approach represents a serious disconnect with how KGs get curated to represent an organization's domain expertise. + +Can the definition of "generative" be reformulated for KGs? +Instead of trying to use a fully-automated "black box", what if it were possible to generate _composable elements_ which then get aggregated into a KG? +Some research in topological analysis of graphs indicates potential ways to decompose graphs, which can then be re-composed probabilistically. +While the mathematics may be sound, these techniques need to be understood in the context of a full range of tasks within KG-construction workflows to assess how they can apply for real-world graph data. + +This project explores the use of LLM-augmented components within natural language workflows, focusing on small well-defined tasks within the scope of KG construction. +To address challenges in this problem, this project considers improved means of tokenization, for handling input. 
+In addition, a range of methods are considered for filtering and selecting elements of the output stream, re-composing them into KGs. +This has a side-effect of providing steps toward better pattern identification and variable abstraction layers for graph data, for _graph levels of detail_ (GLOD). + +Many papers aim to evaluate benchmarks, in contrast this line of inquiry focuses on integration: +means of combining multiple complementary research projects; +how to evaluate the outcomes of other projects to assess their potential usefulness in production-quality libraries; +and suggested directions for improving the LLM-based components of NLP workflows used to construct KGs. + + +## Index Terms + +_natural language processing_, +_knowledge graph construction_, +_large language models_, +_entity extraction_, +_entity linking_, +_relation extraction_, +_semantic random walk_, +_human-in-the-loop_, +_topological decomposition of graphs_, +_graph levels of detail_, +_network motifs_, diff --git a/docs/javascripts/config.js b/docs/javascripts/config.js new file mode 100644 index 0000000000000000000000000000000000000000..ece598636faadb2127b1bf4ff4ad1fd716bdcc45 --- /dev/null +++ b/docs/javascripts/config.js @@ -0,0 +1,12 @@ +window.MathJax = { + tex: { + inlineMath: [["\\(", "\\)"]], + displayMath: [["\\[", "\\]"]], + processEscapes: true, + processEnvironments: true + }, + options: { + ignoreHtmlClass: ".*|", + processHtmlClass: "arithmatex" + } +}; diff --git a/docs/lemma.md b/docs/lemma.md new file mode 100644 index 0000000000000000000000000000000000000000..60e6699f60990c28456c15b97d92c492e97d9043 --- /dev/null +++ b/docs/lemma.md @@ -0,0 +1,23 @@ +# Lemma Graph + +This project introduces the notion of a _lemma graph_ as an intermediate representation. +Effectively, this provides a kind of cache during the processing of each "chunk" of text. +Think of the end result as "enhanced tokenization" for text used to generate graph data elements. 
+Other projects might call this by different names: +an "evidence graph" in [#wen2023mindmap](biblio.md#wen2023mindmap) +or a "dynamically growing local KG" in [#loganlpgs19](biblio.md#loganlpgs19). + +The lemma graph collects metadata from NLP parsing, entity linking, etc., which generally get discarded in many applications. +Therefore the lemma graph becomes rather "noisy", and in most cases would be too big to store across the analysis of a large corpus. + +Leveraging this intermediate form, per chunk, collect the valuable information about nodes, edges, properties, probabilities, etc., to aggregate for the document analysis overall. + +Consequently, this project explores the use of topological transforms on graphs to enhance representations for [_graph levels of detail_](https://blog.derwen.ai/graph-levels-of-detail-ea4226abba55), i.e., being able to understand a graph at varying levels of abstraction. +Note that adjacent areas of interest include emerging work on: + + - _graph of relations_ + - _foundation models for KGs_ + +Means for "bootstrapping" a _lemma graph_ with initial semantic relations allows for "sampling" from a curated KG to enhance the graph algorithms used, e.g., through _semantic random walks_ which allow for incorporating heterogeneous sources and relatively large-scale external KGs. +This mechanism also creates opportunities for distributed processing, because the "chunks" of text can follow a _task parallel_ pattern, accumulating the extracted results from each lemma graph into a graph database. +Augmenting a KG iteratively over time follows a similar pattern.
diff --git a/docs/methods.md b/docs/methods.md new file mode 100644 index 0000000000000000000000000000000000000000..08e25394f3a4fc0abce299113086aa69a66ca942 --- /dev/null +++ b/docs/methods.md @@ -0,0 +1,39 @@ +# Technical Approach + +Construct a _lemma graph_, then perform _entity linking_ based on: +`spaCy`, `transformers`, `SpanMarkerNER`, +`spaCy-DBpedia-Spotlight`, `REBEL`, `OpenNRE`, +`qwikidata`, `pulp` + + 1. use `spaCy` to parse a document, augmented by `SpanMarker` use of LLMs for NER + 1. add noun chunks in parallel to entities, as "candidate" phrases for subsequent HITL confirmation + 1. perform _entity linking_: `spaCy-DBpedia-Spotlight`, `WikiMedia API`, etc. + 1. infer relations, plus graph inference: `REBEL`, `OpenNRE`, `qwikidata`, etc. + 1. build a _lemma graph_ in `NetworkX` from the parse results + 1. run a modified `textrank` algorithm plus graph analytics + 1. approximate a _pareto archive_ (hypervolume) to re-rank extracted entities with `pulp` + 1. visualize the _lemma graph_ interactively in `PyVis` + 1. cluster communities within the _lemma graph_ + 1. apply topological transforms to enhance graph ML and embeddings + 1. build ML models from the _graph of relations_ (in progress) + +In other words, this hybrid approach integrates +_NLP parsing_, _LLMs_, _graph algorithms_, _semantic inference_, +_operations research_, and also provides UX affordances for including +_human-in-the-loop_ practices. + +The demo app and the Hugging Face space both illustrate a relatively +small problem, although they address a much broader class of AI problems +in industry. + +This step is a prelude before leveraging +_topological transforms_, _large language models_, _graph representation learning_, +plus _human-in-the-loop_ domain expertise to infer +the nodes, edges, properties, and probabilities needed for the +semi-automated construction of _knowledge graphs_ from +raw unstructured text sources. 
+ +In addition to providing a library for production use cases, +`TextGraphs` creates a "playground" or "gym" +in which to prototype and evaluate abstractions based on +["Graph Levels Of Detail"](https://blog.derwen.ai/graph-levels-of-detail-ea4226abba55) diff --git a/docs/nlp.md b/docs/nlp.md new file mode 100644 index 0000000000000000000000000000000000000000..41f374b50e5fb95cb92bacede0c44f37471df8c5 --- /dev/null +++ b/docs/nlp.md @@ -0,0 +1,15 @@ +The open source `spaCy` library in Python provides full-featured NLP capabilities. +[#honnibal2020spacy](biblio.md#honnibal2020spacy) +This serves as a core component of this project. +Recent releases of `spaCy` have provided features to integrate with selected large models, and also support native features for entity linking. + +On the one hand, `spaCy` pipelines offer a broad range of integrations and "opinionated" selections for both utility and ease of use. +The resulting pipelines are optimized for annotating streams of spans of tokens. +On the other hand, the opinionated API calls and the abstractions used for pipeline construction and configuration present some important constraints: + + - Pipelines are not especially well-suited for propagating other forms of generated data, beyond token/span streams. + - Tokenization used in `spaCy` does not align with the requirements for relation extraction projects of interest. + - Entity linking capabilities rely on using an internally defined "knowledge base" which is not well-suited for integrating with heterogeneous resources. + +Consequently, while `spaCy` serves as a core component for NLP capabilities, this project presents a library of Python class definitions for KG construction which can be extended and configured to accommodate a broad range of LLM components. +These "less opinionated" pipeline definitions, in the broader scope, are optimized for managing streams of KG candidate elements which have been produced by generative AI.
diff --git a/docs/objectives.md b/docs/objectives.md new file mode 100644 index 0000000000000000000000000000000000000000..baf59fa5134fa00194abf099333c2e39a096483d --- /dev/null +++ b/docs/objectives.md @@ -0,0 +1,19 @@ +Consider three classes of composable elements which are needed for constructing KGs: *nodes*, *edges*, *properties*. +Several areas of machine learning (ML) research can be leveraged to generate these elements from unstructured text sources: + + - nodes: NER, node prediction + - edges: relation extraction (RE), semantic inference, link prediction + - properties: NLP parse, entity linking, graph analytics + +Weights or probabilities from the analysis can also be used to construct *gradients* for ranking each class of elements in the generated output. +This supports multiple approaches for filtering, selection, and abstraction of the generated composable elements, and helps incorporate domain expertise. + +A set of questions follows from this line of inquiry: + +**RQ1**: can workflows be defined which integrate LLM-based components and generate _composable elements_ for KGs, while managing the quality of the generated results? + +**RQ2**: can topological analysis and decomposition of graph data help inform better ways to generate graph elements, e.g., by leveraging patterns within graphs (network motifs) and graph abstraction layers? + +**RQ3**: where might it be possible to improve data quality -- for training data, benchmarks, evals, etc. -- then iterate to train more effective LLM-based components? + +**RQ4**: how can consistent evaluations of open source related to ML research be made, assessing opportunities for reusing code in production-quality libraries?
diff --git a/docs/prob.md b/docs/prob.md new file mode 100644 index 0000000000000000000000000000000000000000..92041a30e6407ee1e4054d3db2118fa908642409 --- /dev/null +++ b/docs/prob.md @@ -0,0 +1,19 @@ +**TODO**: summarize from + +Results from the combined analysis get collected into an intermediate form which is a probabilistic structure called a _lemma graph_. + +Note: NLP parsers tend to produce a wealth of annotations from raw text, most of which are thrown away in many applications. What if instead this parse information got collected together, temporarily while analyzing a chunk of text? + +An application running in production most likely would not want to persist the entirety of _lemma graph_ data generated during analysis of a full corpus. Instead, consider this structure as a kind of temporary cache during the analysis for one unit of work, i.e., a "chunk" of text. + +From the pragmatics of writing, editing, and critical review, a natural size for this kind of chunking is to analyze at the paragraph level. In some domains, such as analysis of patent applications, chunking at the level of "claims" might be indicated. + +The probabilistic aspects of the intermediate _lemma graph_ data become especially important in a linguistic context: + + * entities have many _surface forms_ + * synonyms (synsets) change meanings in different domains, especially when abbreviated + * ambiguous references may exist, though not all are important to resolve based on "premature optimization" + +Note that semantic modeling practices using RDF tend to have a relatively trivial notion of "synonyms", notably by annotating a subject with one _preferred label_ and zero or more additional labels. +This may be sufficiently descriptive for building taxonomies manually; however, this approach is not sufficient for making the modeled representation computable in light of the many kinds of _surface forms_ and possible sources of ambiguity.
+The RDF representation uses `skos:broader` to connect surface forms, and the LPG representation uses probabilities to manage disambiguation of these terms. diff --git a/docs/ref.md b/docs/ref.md new file mode 100644 index 0000000000000000000000000000000000000000..90633b454d15251a921dff8a703dddc456cc0033 --- /dev/null +++ b/docs/ref.md @@ -0,0 +1,1646 @@ +# Reference: `textgraphs` package +API by Adnen Kadri from the Noun Project +Package definitions for the `TextGraphs` library. + +see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md + + +## [`TextGraphs` class](#TextGraphs) + +Construct a _lemma graph_ from the unstructured text source, +then extract ranked phrases using a `textgraph` algorithm. + +--- +#### [`infer_relations_async` method](#textgraphs.TextGraphs.infer_relations_async) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L641) + +```python +infer_relations_async(pipe, debug=False) +``` +Gather triples representing inferred relations and build edges, +concurrently by running an async queue. + + +Make sure to call beforehand: `TextGraphs.collect_graph_elements()` + + * `pipe` : `textgraphs.pipe.Pipeline` +configured pipeline for this document + + * `debug` : `bool` +debugging flag + + * *returns* : `typing.List[textgraphs.elem.Edge]` +a list of the inferred `Edge` objects + + + +--- +#### [`__init__` method](#textgraphs.TextGraphs.__init__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L80) + +```python +__init__(factory=None, iri_base="https://github.com/DerwenAI/textgraphs/ns/") +``` +Constructor.
+ + * `factory` : `typing.Optional[textgraphs.pipe.PipelineFactory]` +optional `PipelineFactory` used to configure components + + + +--- +#### [`create_pipeline` method](#textgraphs.TextGraphs.create_pipeline) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L103) + +```python +create_pipeline(text_input) +``` +Use the pipeline factory to create a pipeline (e.g., `spaCy.Document`) +for each text input, which are typically paragraph-length. + + * `text_input` : `str` +raw text to be parsed by this pipeline + + * *returns* : `textgraphs.pipe.Pipeline` +a configured pipeline + + + +--- +#### [`create_render` method](#textgraphs.TextGraphs.create_render) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L122) + +```python +create_render() +``` +Create an object for rendering the graph in `PyVis` HTML+JavaScript. + + * *returns* : `textgraphs.vis.RenderPyVis` +a configured `RenderPyVis` object for generating graph visualizations + + + +--- +#### [`collect_graph_elements` method](#textgraphs.TextGraphs.collect_graph_elements) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L381) + +```python +collect_graph_elements(pipe, text_id=0, para_id=0, debug=False) +``` +Collect the elements of a _lemma graph_ from the results of running +the `textgraph` algorithm. These elements include: parse dependencies, +lemmas, entities, and noun chunks. 
+ +Make sure to call beforehand: `TextGraphs.create_pipeline()` + + * `pipe` : `textgraphs.pipe.Pipeline` +configured pipeline for this document + + * `text_id` : `int` +text (top-level document) identifier + + * `para_id` : `int` +paragraph identifier + + * `debug` : `bool` +debugging flag + + + +--- +#### [`construct_lemma_graph` method](#textgraphs.TextGraphs.construct_lemma_graph) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L474) + +```python +construct_lemma_graph(debug=False) +``` +Construct the base level of the _lemma graph_ from the collected +elements. This gets represented in `NetworkX` as a directed graph +with parallel edges. + +Make sure to call beforehand: `TextGraphs.collect_graph_elements()` + + * `debug` : `bool` +debugging flag + + + +--- +#### [`perform_entity_linking` method](#textgraphs.TextGraphs.perform_entity_linking) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L534) + +```python +perform_entity_linking(pipe, debug=False) +``` +Perform _entity linking_ based on the `KnowledgeGraph` object. + +Make sure to call beforehand: `TextGraphs.collect_graph_elements()` + + * `pipe` : `textgraphs.pipe.Pipeline` +configured pipeline for this document + + * `debug` : `bool` +debugging flag + + + +--- +#### [`infer_relations` method](#textgraphs.TextGraphs.infer_relations) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L705) + +```python +infer_relations(pipe, debug=False) +``` +Gather triples representing inferred relations and build edges.
+ +Make sure to call beforehand: `TextGraphs.collect_graph_elements()` + + * `pipe` : `textgraphs.pipe.Pipeline` +configured pipeline for this document + + * `debug` : `bool` +debugging flag + + * *returns* : `typing.List[textgraphs.elem.Edge]` +a list of the inferred `Edge` objects + + + +--- +#### [`calc_phrase_ranks` method](#textgraphs.TextGraphs.calc_phrase_ranks) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L893) + +```python +calc_phrase_ranks(pr_alpha=0.85, debug=False) +``` +Calculate the weights for each node in the _lemma graph_, then +stack-rank the nodes so that entities have priority over lemmas. + +Phrase ranks are normalized to sum to 1.0 and these now represent +the ranked entities extracted from the document. + +Make sure to call beforehand: `TextGraphs.construct_lemma_graph()` + + * `pr_alpha` : `float` +optional `alpha` parameter for the PageRank algorithm + + * `debug` : `bool` +debugging flag + + + +--- +#### [`get_phrases` method](#textgraphs.TextGraphs.get_phrases) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L940) + +```python +get_phrases() +``` +Return the entities extracted from the document. + +Make sure to call beforehand: `TextGraphs.calc_phrase_ranks()` + + * *yields* : +extracted entities + + + +--- +#### [`get_phrases_as_df` method](#textgraphs.TextGraphs.get_phrases_as_df) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L973) + +```python +get_phrases_as_df() +``` +Return the ranked extracted entities as a dataframe. 
+ +Make sure to call beforehand: `TextGraphs.calc_phrase_ranks()` + + * *returns* : `pandas.core.frame.DataFrame` +a `pandas.DataFrame` of the extracted entities + + + +--- +#### [`export_rdf` method](#textgraphs.TextGraphs.export_rdf) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L990) + +```python +export_rdf(lang="en") +``` +Extract the entities and relations which have IRIs as RDF triples. + + * `lang` : `str` +language identifier + + * *returns* : `str` +RDF triples in N3 (Turtle) format as a string + + + +--- +#### [`denormalize_iri` method](#textgraphs.TextGraphs.denormalize_iri) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L1085) + +```python +denormalize_iri(uri_ref) +``` +Discern between a parsed entity and a linked entity. + + * *returns* : `str` +_lemma_key_ for a parsed entity, the full IRI for a linked entity + + + +--- +#### [`load_bootstrap_ttl` method](#textgraphs.TextGraphs.load_bootstrap_ttl) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L1103) + +```python +load_bootstrap_ttl(ttl_str, debug=False) +``` +Parse a TTL string with an RDF semantic graph representation to load +bootstrap definitions for the _lemma graph_ prior to parsing, e.g., +for synonyms. + + * `ttl_str` : `str` +RDF triples in TTL (Turtle/N3) format + + * `debug` : `bool` +debugging flag + + + +--- +#### [`export_kuzu` method](#textgraphs.TextGraphs.export_kuzu) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/doc.py#L1215) + +```python +export_kuzu(zip_name="lemma.zip", debug=False) +``` +Export a labeled property graph for KùzuDB (openCypher). + + * `debug` : `bool` +debugging flag + + * *returns* : `str` +name of the generated ZIP file + + + +## [`SimpleGraph` class](#SimpleGraph) + +An in-memory graph used to build a `MultiDiGraph` in NetworkX.
+ +--- +#### [`__init__` method](#textgraphs.SimpleGraph.__init__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/graph.py#L31) + +```python +__init__() +``` +Constructor. + + + +--- +#### [`reset` method](#textgraphs.SimpleGraph.reset) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/graph.py#L42) + +```python +reset() +``` +Re-initialize the data structures, resetting all but the configuration. + + + +--- +#### [`make_node` method](#textgraphs.SimpleGraph.make_node) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/graph.py#L53) + +```python +make_node(tokens, key, span, kind, text_id, para_id, sent_id, label=None, length=1, linked=True) +``` +Look up and return a `Node` object. +By default, link matching keys into the same node. +Otherwise instantiate a new node if it does not exist already. + + * `tokens` : `typing.List[textgraphs.elem.Node]` +list of parsed tokens + + * `key` : `str` +lemma key (invariant) + + * `span` : `spacy.tokens.token.Token` +token span for the parsed entity + + * `kind` : `` +the kind of this `Node` object + + * `text_id` : `int` +text (top-level document) identifier + + * `para_id` : `int` +paragraph identifier + + * `sent_id` : `int` +sentence identifier + + * `label` : `typing.Optional[str]` +node label (for a new object) + + * `length` : `int` +length of token span + + * `linked` : `bool` +flag for whether this links to an entity + + * *returns* : `textgraphs.elem.Node` +the constructed `Node` object + + + +--- +#### [`make_edge` method](#textgraphs.SimpleGraph.make_edge) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/graph.py#L167) + +```python +make_edge(src_node, dst_node, kind, rel, prob, key=None, debug=False) +``` +Look up an edge, creating a new one if it does not exist already, +and increment the count if it does.
+ + * `src_node` : `textgraphs.elem.Node` +source node in the triple + + * `dst_node` : `textgraphs.elem.Node` +destination node in the triple + + * `kind` : `` +the kind of this `Edge` object + + * `rel` : `str` +relation label + + * `prob` : `float` +probability of this `Edge` within the graph + + * `key` : `typing.Optional[str]` +lemma key (invariant); generate a key if this is not provided + + * `debug` : `bool` +debugging flag + + * *returns* : `typing.Optional[textgraphs.elem.Edge]` +the constructed `Edge` object; this may be `None` if the input parameters indicate skipping the edge + + + +--- +#### [`dump_lemma_graph` method](#textgraphs.SimpleGraph.dump_lemma_graph) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/graph.py#L236) + +```python +dump_lemma_graph() +``` +Dump the _lemma graph_ as a JSON string in _node-link_ format, +suitable for serialization and subsequent use in JavaScript, +Neo4j, Graphistry, etc. + +Make sure to call beforehand: `TextGraphs.calc_phrase_ranks()` + + * *returns* : `str` +a JSON representation of the exported _lemma graph_ in _node-link_ format + + + +--- +#### [`load_lemma_graph` method](#textgraphs.SimpleGraph.load_lemma_graph) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/graph.py#L299) + +```python +load_lemma_graph(json_str, debug=False) +``` +Load from a JSON string, which is +a JSON representation of the exported _lemma graph_ in +[_node-link_](https://networkx.org/documentation/stable/reference/readwrite/json_graph.html) +format + + * `debug` : `bool` +debugging flag + + + +## [`Node` class](#Node) + +A data class representing one node, i.e., an extracted phrase.
+ +--- +#### [`__repr__` method](#textgraphs.Node.__repr__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/dataclasses.py#L232) + +```python +__repr__() +``` + +--- +#### [`get_linked_label` method](#textgraphs.Node.get_linked_label) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/elem.py#L119) + +```python +get_linked_label() +``` +When this node has a linked entity, return that IRI. +Otherwise return its `label` value. + + * *returns* : `typing.Optional[str]` +a label for the linked entity + + + +--- +#### [`get_name` method](#textgraphs.Node.get_name) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/elem.py#L135) + +```python +get_name() +``` +Return a brief name for the graphical depiction of this Node. + + * *returns* : `str` +brief label to be used in a graph + + + +--- +#### [`get_stacked_count` method](#textgraphs.Node.get_stacked_count) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/elem.py#L152) + +```python +get_stacked_count() +``` +Return a modified count, to redact verbs and linked entities from +the stack-rank partitions. + + * *returns* : `int` +count, used for re-ranking extracted entities + + + +--- +#### [`get_pos` method](#textgraphs.Node.get_pos) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/elem.py#L168) + +```python +get_pos() +``` +Generate a position span for `OpenNRE`. + + * *returns* : `typing.Tuple[int, int]` +a position span needed for `OpenNRE` relation extraction + + + +## [`Edge` class](#Edge) + +A data class representing an edge between two nodes. 
+ +--- +#### [`__repr__` method](#textgraphs.Edge.__repr__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/dataclasses.py#L232) + +```python +__repr__() +``` + +## [`EnumBase` class](#EnumBase) + +A mixin for Enum codecs. + +## [`NodeEnum` class](#NodeEnum) + +Enumeration for the kinds of node categories + +## [`RelEnum` class](#RelEnum) + +Enumeration for the kinds of edge relations + +## [`PipelineFactory` class](#PipelineFactory) + +Factory pattern for building a pipeline, which is one of the more +expensive operations with `spaCy` + +--- +#### [`__init__` method](#textgraphs.PipelineFactory.__init__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L434) + +```python +__init__(spacy_model="en_core_web_sm", ner=None, kg=, infer_rels=[]) +``` +Constructor which instantiates the `spaCy` pipelines: + + * `tok_pipe` -- regular generator for parsed tokens + * `ner_pipe` -- with entities merged + * `aux_pipe` -- spotlight entity linking + +which will be needed for parsing and entity linking. + + * `spacy_model` : `str` +the specific model to use in `spaCy` pipelines + + * `ner` : `typing.Optional[textgraphs.pipe.Component]` +optional custom NER component + + * `kg` : `textgraphs.pipe.KnowledgeGraph` +knowledge graph used for entity linking + + * `infer_rels` : `typing.List[textgraphs.pipe.InferRel]` +a list of components for inferring relations + + + +--- +#### [`create_pipeline` method](#textgraphs.PipelineFactory.create_pipeline) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L508) + +```python +create_pipeline(text_input) +``` +Instantiate the document pipelines needed to parse the input text. 
+ + * `text_input` : `str` +raw text to be parsed + + * *returns* : `textgraphs.pipe.Pipeline` +a configured `Pipeline` object + + + +## [`Pipeline` class](#Pipeline) + +Manage parsing of a document, which is assumed to be paragraph-sized. + +--- +#### [`__init__` method](#textgraphs.Pipeline.__init__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L216) + +```python +__init__(text_input, tok_pipe, ner_pipe, aux_pipe, kg, infer_rels) +``` +Constructor. + + * `text_input` : `str` +raw text to be parsed + + * `tok_pipe` : `spacy.language.Language` +the `spaCy.Language` pipeline used for tallying individual tokens + + * `ner_pipe` : `spacy.language.Language` +the `spaCy.Language` pipeline used for tallying named entities + + * `aux_pipe` : `spacy.language.Language` +the `spaCy.Language` pipeline used for auxiliary components (e.g., `DBPedia Spotlight`) + + * `kg` : `textgraphs.pipe.KnowledgeGraph` +knowledge graph used for entity linking + + * `infer_rels` : `typing.List[textgraphs.pipe.InferRel]` +a list of components for inferring relations + + + +--- +#### [`get_lemma_key` classmethod](#textgraphs.Pipeline.get_lemma_key) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L267) + +```python +get_lemma_key(span, placeholder=False) +``` +Compose a unique, invariant lemma key for the given span. + + * `span` : `typing.Union[spacy.tokens.span.Span, spacy.tokens.token.Token]` +span of tokens within the lemma + + * `placeholder` : `bool` +flag for whether to create a placeholder + + * *returns* : `str` +a composed lemma key + + + +--- +#### [`get_ent_lemma_keys` method](#textgraphs.Pipeline.get_ent_lemma_keys) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L308) + +```python +get_ent_lemma_keys() +``` +Iterate through the fully qualified lemma keys for an extracted entity. 
+ + * *yields* : +the lemma keys within an extracted entity + + + +--- +#### [`link_noun_chunks` method](#textgraphs.Pipeline.link_noun_chunks) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L321) + +```python +link_noun_chunks(nodes, debug=False) +``` +Link any noun chunks which are not already subsumed by named entities. + + * `nodes` : `dict` +dictionary of `Node` objects in the graph + + * `debug` : `bool` +debugging flag + + * *returns* : `typing.List[textgraphs.elem.NounChunk]` +a list of identified noun chunks which are novel + + + +--- +#### [`iter_entity_pairs` method](#textgraphs.Pipeline.iter_entity_pairs) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L373) + +```python +iter_entity_pairs(pipe_graph, max_skip, debug=True) +``` +Iterator for entity pairs for which the algorithm infers relations. + + * `pipe_graph` : `networkx.classes.multigraph.MultiGraph` +a `networkx.MultiGraph` representation of the graph, reused for graph algorithms + + * `max_skip` : `int` +maximum distance between entities for inferred relations + + * `debug` : `bool` +debugging flag + + * *yields* : +pairs of entities within a range, e.g., to use for relation extraction + + + +## [`Component` class](#Component) + +Abstract base class for a `spaCy` pipeline component. + +--- +#### [`augment_pipe` method](#textgraphs.Component.augment_pipe) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L41) + +```python +augment_pipe(factory) +``` +Encapsulate a `spaCy` call to `add_pipe()` configuration. 
+ + * `factory` : `PipelineFactory` +a `PipelineFactory` used to configure components + + + +## [`NERSpanMarker` class](#NERSpanMarker) + +Configures a `spaCy` pipeline component for `SpanMarkerNER` + +--- +#### [`__init__` method](#textgraphs.NERSpanMarker.__init__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/ner.py#L22) + +```python +__init__(ner_model="tomaarsen/span-marker-roberta-large-ontonotes5") +``` +Constructor. + + * `ner_model` : `str` +model to be used in `SpanMarker` + + + +--- +#### [`augment_pipe` method](#textgraphs.NERSpanMarker.augment_pipe) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/ner.py#L36) + +```python +augment_pipe(factory) +``` +Encapsulate a `spaCy` call to `add_pipe()` configuration. + + * `factory` : `textgraphs.pipe.PipelineFactory` +the `PipelineFactory` used to configure this pipeline component + + + +## [`NounChunk` class](#NounChunk) + +A data class representing one noun chunk, i.e., a candidate as an extracted phrase. + +--- +#### [`__repr__` method](#textgraphs.NounChunk.__repr__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/dataclasses.py#L232) + +```python +__repr__() +``` + +## [`KnowledgeGraph` class](#KnowledgeGraph) + +Base class for a _knowledge graph_ interface. + +--- +#### [`augment_pipe` method](#textgraphs.KnowledgeGraph.augment_pipe) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L63) + +```python +augment_pipe(factory) +``` +Encapsulate a `spaCy` call to `add_pipe()` configuration. 
+ + * `factory` : `PipelineFactory` +a `PipelineFactory` used to configure components + + + +--- +#### [`remap_ner` method](#textgraphs.KnowledgeGraph.remap_ner) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L76) + +```python +remap_ner(label) +``` +Remap the OntoTypes4 values from NER output to more general-purpose IRIs. + + * `label` : `typing.Optional[str]` +input NER label, an `OntoTypes4` value + + * *returns* : `typing.Optional[str]` +an IRI for the named entity + + + +--- +#### [`normalize_prefix` method](#textgraphs.KnowledgeGraph.normalize_prefix) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L92) + +```python +normalize_prefix(iri, debug=False) +``` +Normalize the given IRI to use standard namespace prefixes. + + * `iri` : `str` +input IRI, in fully-qualified domain representation + + * `debug` : `bool` +debugging flag + + * *returns* : `str` +the compact IRI representation, using an RDF namespace prefix + + + +--- +#### [`perform_entity_linking` method](#textgraphs.KnowledgeGraph.perform_entity_linking) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L113) + +```python +perform_entity_linking(graph, pipe, debug=False) +``` +Perform _entity linking_ based on "spotlight" and other services. + + * `graph` : `textgraphs.graph.SimpleGraph` +source graph + + * `pipe` : `Pipeline` +configured pipeline for the current document + + * `debug` : `bool` +debugging flag + + + +--- +#### [`resolve_rel_iri` method](#textgraphs.KnowledgeGraph.resolve_rel_iri) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L135) + +```python +resolve_rel_iri(rel, lang="en", debug=False) +``` +Resolve a `rel` string from a _relation extraction_ model which has +been trained on this knowledge graph. 
+ + * `rel` : `str` +relation label; these are generally sourced from Wikidata for many RE projects + + * `lang` : `str` +language identifier + + * `debug` : `bool` +debugging flag + + * *returns* : `typing.Optional[str]` +a resolved IRI + + + +## [`KGSearchHit` class](#KGSearchHit) + +A data class representing a hit from a _knowledge graph_ search. + +--- +#### [`__repr__` method](#textgraphs.KGSearchHit.__repr__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/dataclasses.py#L232) + +```python +__repr__() +``` + +## [`KGWikiMedia` class](#KGWikiMedia) + +Manage access to WikiMedia-related APIs. + +--- +#### [`__init__` method](#textgraphs.KGWikiMedia.__init__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/kg.py#L165) + +```python +__init__(spotlight_api="https://api.dbpedia-spotlight.org/en", dbpedia_search_api="https://lookup.dbpedia.org/api/search", dbpedia_sparql_api="https://dbpedia.org/sparql", wikidata_api="https://www.wikidata.org/w/api.php", ner_map=OrderedDict([('CARDINAL', {'iri': 'http://dbpedia.org/resource/Cardinal_number', 'definition': 'Numerals that do not fall under another type', 'label': 'cardinal number'}), ('DATE', {'iri': 'http://dbpedia.org/ontology/date', 'definition': 'Absolute or relative dates or periods', 'label': 'date'}), ('EVENT', {'iri': 'http://dbpedia.org/ontology/Event', 'definition': 'Named hurricanes, battles, wars, sports events, etc.', 'label': 'event'}), ('FAC', {'iri': 'http://dbpedia.org/ontology/Infrastructure', 'definition': 'Buildings, airports, highways, bridges, etc.', 'label': 'infrastructure'}), ('GPE', {'iri': 'http://dbpedia.org/ontology/Country', 'definition': 'Countries, cities, states', 'label': 'country'}), ('LANGUAGE', {'iri': 'http://dbpedia.org/ontology/Language', 'definition': 'Any named language', 'label': 'language'}), ('LAW', {'iri': 'http://dbpedia.org/ontology/Law', 'definition': 'Named
documents made into laws', 'label': 'law'}), ('LOC', {'iri': 'http://dbpedia.org/ontology/Place', 'definition': 'Non-GPE locations, mountain ranges, bodies of water', 'label': 'place'}), ('MONEY', {'iri': 'http://dbpedia.org/resource/Money', 'definition': 'Monetary values, including unit', 'label': 'money'}), ('NORP', {'iri': 'http://dbpedia.org/ontology/nationality', 'definition': 'Nationalities or religious or political groups', 'label': 'nationality'}), ('ORDINAL', {'iri': 'http://dbpedia.org/resource/Ordinal_number', 'definition': 'Ordinal number, i.e., first, second, etc.', 'label': 'ordinal number'}), ('ORG', {'iri': 'http://dbpedia.org/ontology/Organisation', 'definition': 'Companies, agencies, institutions, etc.', 'label': 'organization'}), ('PERCENT', {'iri': 'http://dbpedia.org/resource/Percentage', 'definition': 'Percentage', 'label': 'percentage'}), ('PERSON', {'iri': 'http://dbpedia.org/ontology/Person', 'definition': 'People, including fictional', 'label': 'person'}), ('PRODUCT', {'iri': 'http://dbpedia.org/ontology/product', 'definition': 'Vehicles, weapons, foods, etc. 
(Not services)', 'label': 'product'}), ('QUANTITY', {'iri': 'http://dbpedia.org/resource/Quantity', 'definition': 'Measurements, as of weight or distance', 'label': 'quantity'}), ('TIME', {'iri': 'http://dbpedia.org/ontology/time', 'definition': 'Times smaller than a day', 'label': 'time'}), ('WORK OF ART', {'iri': 'http://dbpedia.org/resource/Work_of_art', 'definition': 'Titles of books, songs, etc.', 'label': 'work of art'})]), ns_prefix=OrderedDict([('dbc', 'http://dbpedia.org/resource/Category:'), ('dbt', 'http://dbpedia.org/resource/Template:'), ('dbr', 'http://dbpedia.org/resource/'), ('yago', 'http://dbpedia.org/class/yago/'), ('dbd', 'http://dbpedia.org/datatype/'), ('dbo', 'http://dbpedia.org/ontology/'), ('dbp', 'http://dbpedia.org/property/'), ('units', 'http://dbpedia.org/units/'), ('dbpedia-commons', 'http://commons.dbpedia.org/resource/'), ('dbpedia-wikicompany', 'http://dbpedia.openlinksw.com/wikicompany/'), ('dbpedia-wikidata', 'http://wikidata.dbpedia.org/resource/'), ('wd', 'http://www.wikidata.org/'), ('wd_ent', 'http://www.wikidata.org/entity/'), ('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'), ('schema', 'https://schema.org/'), ('owl', 'http://www.w3.org/2002/07/owl#')]), min_alias=0.8, min_similarity=0.9) +``` +Constructor. 
+ + * `spotlight_api` : `str` +`DBPedia Spotlight` API or equivalent local service + + * `dbpedia_search_api` : `str` +`DBPedia Search` API or equivalent local service + + * `dbpedia_sparql_api` : `str` +`DBPedia SPARQL` API or equivalent local service + + * `wikidata_api` : `str` +`Wikidata Search` API or equivalent local service + + * `ner_map` : `dict` +named entity map for standardizing IRIs + + * `ns_prefix` : `dict` +RDF namespace prefixes + + * `min_alias` : `float` +minimum alias probability threshold for accepting linked entities + + * `min_similarity` : `float` +minimum label similarity threshold for accepting linked entities + + + +--- +#### [`augment_pipe` method](#textgraphs.KGWikiMedia.augment_pipe) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/kg.py#L219) + +```python +augment_pipe(factory) +``` +Encapsulate a `spaCy` call to `add_pipe()` configuration. + + * `factory` : `textgraphs.pipe.PipelineFactory` +a `PipelineFactory` used to configure components + + + +--- +#### [`remap_ner` method](#textgraphs.KGWikiMedia.remap_ner) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/kg.py#L237) + +```python +remap_ner(label) +``` +Remap the OntoTypes4 values from NER output to more general-purpose IRIs. + + * `label` : `typing.Optional[str]` +input NER label, an `OntoTypes4` value + + * *returns* : `typing.Optional[str]` +an IRI for the named entity + + + +--- +#### [`normalize_prefix` method](#textgraphs.KGWikiMedia.normalize_prefix) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/kg.py#L266) + +```python +normalize_prefix(iri, debug=False) +``` +Normalize the given IRI using the standard DBPedia namespace prefixes. 
+ + * `iri` : `str` +input IRI, in fully-qualified domain representation + + * `debug` : `bool` +debugging flag + + * *returns* : `str` +the compact IRI representation, using an RDF namespace prefix + + + +--- +#### [`perform_entity_linking` method](#textgraphs.KGWikiMedia.perform_entity_linking) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/kg.py#L306) + +```python +perform_entity_linking(graph, pipe, debug=False) +``` +Perform _entity linking_ based on `DBPedia Spotlight` and other services. + + * `graph` : `textgraphs.graph.SimpleGraph` +source graph + + * `pipe` : `textgraphs.pipe.Pipeline` +configured pipeline for the current document + + * `debug` : `bool` +debugging flag + + + +--- +#### [`resolve_rel_iri` method](#textgraphs.KGWikiMedia.resolve_rel_iri) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/kg.py#L370) + +```python +resolve_rel_iri(rel, lang="en", debug=False) +``` +Resolve a `rel` string from a _relation extraction_ model which has +been trained on this _knowledge graph_, which defaults to using the +`WikiMedia` graphs. + + * `rel` : `str` +relation label; these are generally sourced from Wikidata for many RE projects + + * `lang` : `str` +language identifier + + * `debug` : `bool` +debugging flag + + * *returns* : `typing.Optional[str]` +a resolved IRI + + + +--- +#### [`wikidata_search` method](#textgraphs.KGWikiMedia.wikidata_search) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/kg.py#L575) + +```python +wikidata_search(query, lang="en", debug=False) +``` +Query the Wikidata search API.
+ + * `query` : `str` +query string + + * `lang` : `str` +language identifier + + * `debug` : `bool` +debugging flag + + * *returns* : `typing.Optional[textgraphs.elem.KGSearchHit]` +search hit, if any + + + +--- +#### [`dbpedia_search_entity` method](#textgraphs.KGWikiMedia.dbpedia_search_entity) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/kg.py#L641) + +```python +dbpedia_search_entity(query, lang="en", debug=False) +``` +Perform a DBPedia API search. + + * `query` : `str` +query string + + * `lang` : `str` +language identifier + + * `debug` : `bool` +debugging flag + + * *returns* : `typing.Optional[textgraphs.elem.KGSearchHit]` +search hit, if any + + + +--- +#### [`dbpedia_sparql_query` method](#textgraphs.KGWikiMedia.dbpedia_sparql_query) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/kg.py#L738) + +```python +dbpedia_sparql_query(sparql, debug=False) +``` +Perform a SPARQL query on DBPedia. + + * `sparql` : `str` +SPARQL query string + + * `debug` : `bool` +debugging flag + + * *returns* : `dict` +dictionary of query results + + + +--- +#### [`dbpedia_wikidata_equiv` method](#textgraphs.KGWikiMedia.dbpedia_wikidata_equiv) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/kg.py#L791) + +```python +dbpedia_wikidata_equiv(dbpedia_iri, debug=False) +``` +Perform a SPARQL query on DBPedia to find an equivalent Wikidata entity. + + * `dbpedia_iri` : `str` +IRI in DBpedia + + * `debug` : `bool` +debugging flag + + * *returns* : `typing.Optional[str]` +equivalent IRI in Wikidata + + + +## [`LinkedEntity` class](#LinkedEntity) + +A data class representing one linked entity. 
+ +--- +#### [`__repr__` method](#textgraphs.LinkedEntity.__repr__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/dataclasses.py#L232) + +```python +__repr__() +``` + +## [`InferRel` class](#InferRel) + +Abstract base class for a _relation extraction_ model wrapper. + +--- +#### [`gen_triples_async` method](#textgraphs.InferRel.gen_triples_async) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L188) + +```python +gen_triples_async(pipe, queue, debug=False) +``` +Infer relations as triples produced to a queue _concurrently_. + + * `pipe` : `Pipeline` +configured pipeline for the current document + + * `queue` : `asyncio.queues.Queue` +queue of inference tasks to be performed + + * `debug` : `bool` +debugging flag + + + +--- +#### [`gen_triples` method](#textgraphs.InferRel.gen_triples) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/pipe.py#L166) + +```python +gen_triples(pipe, debug=False) +``` +Infer relations as triples through a generator _iteratively_. + + * `pipe` : `Pipeline` +configured pipeline for the current document + + * `debug` : `bool` +debugging flag + + * *yields* : +generated triples + + + +## [`InferRel_OpenNRE` class](#InferRel_OpenNRE) + +Perform relation extraction based on the `OpenNRE` model. + + +--- +#### [`__init__` method](#textgraphs.InferRel_OpenNRE.__init__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/rel.py#L33) + +```python +__init__(model="wiki80_cnn_softmax", max_skip=11, min_prob=0.9) +``` +Constructor. 
+ + * `model` : `str` +the specific model to be used in `OpenNRE` + + * `max_skip` : `int` +maximum distance between entities for inferred relations + + * `min_prob` : `float` +minimum probability threshold for accepting an inferred relation + + + +--- +#### [`gen_triples` method](#textgraphs.InferRel_OpenNRE.gen_triples) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/rel.py#L58) + +```python +gen_triples(pipe, debug=False) +``` +Iterate on entity pairs to drive `OpenNRE`, inferring relations +represented as triples which get produced by a generator. + + * `pipe` : `textgraphs.pipe.Pipeline` +configured pipeline for the current document + + * `debug` : `bool` +debugging flag + + * *yields* : +generated triples as candidates for inferred relations + + + +## [`InferRel_Rebel` class](#InferRel_Rebel) + +Perform relation extraction based on the `REBEL` model. + + + +--- +#### [`__init__` method](#textgraphs.InferRel_Rebel.__init__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/rel.py#L121) + +```python +__init__(lang="en_XX", mrebel_model="Babelscape/mrebel-large") +``` +Constructor. + + * `lang` : `str` +language identifier + + * `mrebel_model` : `str` +tokenizer model to be used + + + +--- +#### [`tokenize_sent` method](#textgraphs.InferRel_Rebel.tokenize_sent) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/rel.py#L145) + +```python +tokenize_sent(text) +``` +Apply the tokenizer manually, since we need to extract special tokens. + + * `text` : `str` +input text for the sentence to be tokenized + + * *returns* : `str` +extracted tokens + + + +--- +#### [`extract_triplets_typed` method](#textgraphs.InferRel_Rebel.extract_triplets_typed) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/rel.py#L174) + +```python +extract_triplets_typed(text) +``` +Parse the generated text and extract its triplets. 
+ + * `text` : `str` +input text for the sentence to use in inference + + * *returns* : `list` +a list of extracted triples + + + +--- +#### [`gen_triples` method](#textgraphs.InferRel_Rebel.gen_triples) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/rel.py#L259) + +```python +gen_triples(pipe, debug=False) +``` +Drive `REBEL` to infer relations for each sentence, represented as +triples which get produced by a generator. + + * `pipe` : `textgraphs.pipe.Pipeline` +configured pipeline for the current document + + * `debug` : `bool` +debugging flag + + * *yields* : +generated triples as candidates for inferred relations + + + +## [`RenderPyVis` class](#RenderPyVis) + +Render the _lemma graph_ as a `PyVis` network. + +--- +#### [`__init__` method](#textgraphs.RenderPyVis.__init__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/vis.py#L76) + +```python +__init__(graph, kg) +``` +Constructor. + + * `graph` : `textgraphs.graph.SimpleGraph` +source graph to be visualized + + * `kg` : `textgraphs.pipe.KnowledgeGraph` +knowledge graph used for entity linking + + + +--- +#### [`render_lemma_graph` method](#textgraphs.RenderPyVis.render_lemma_graph) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/vis.py#L94) + +```python +render_lemma_graph(debug=True) +``` +Prepare the structure of the `NetworkX` graph to use for building +and returning a `PyVis` network to render. 
+ +Make sure to call beforehand: `TextGraphs.calc_phrase_ranks()` + + * `debug` : `bool` +debugging flag + + * *returns* : `pyvis.network.Network` +#L2) + +```python +__setattr__(name, value) +``` + +## [`GraphOfRelations` class](#GraphOfRelations) + +Attempt to reproduce results published in +"INGRAM: Inductive Knowledge Graph Embedding via Relation Graphs" + + +--- +#### [`__init__` method](#textgraphs.GraphOfRelations.__init__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/gor.py#L100) + +```python +__init__(source) +``` +Constructor. + + * `source` : `textgraphs.graph.SimpleGraph` +source graph to be transformed + + + +--- +#### [`load_ingram` method](#textgraphs.GraphOfRelations.load_ingram) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/gor.py#L125) + +```python +load_ingram(json_file, debug=False) +``` +Load data for a source graph, as illustrated in _lee2023ingram_ + + * `json_file` : `pathlib.Path` +path for the JSON dataset to load + + * `debug` : `bool` +debugging flag + + + +--- +#### [`seeds` method](#textgraphs.GraphOfRelations.seeds) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/gor.py#L197) + +```python +seeds(debug=False) +``` +Prep data for the topological transform illustrated in _lee2023ingram_ + + * `debug` : `bool` +debugging flag + + + +--- +#### [`trace_source_graph` method](#textgraphs.GraphOfRelations.trace_source_graph) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/gor.py#L241) + +```python +trace_source_graph() +``` +Output a "seed" representation of the source graph. 
+ + + +--- +#### [`construct_gor` method](#textgraphs.GraphOfRelations.construct_gor) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/gor.py#L311) + +```python +construct_gor(debug=False) +``` +Perform the topological transform described by _lee2023ingram_, +constructing a _graph of relations_ (GOR) and calculating +_affinity scores_ between entities in the GOR based on their +definitions: + +> we measure the affinity between two relations by considering how many +entities are shared between them and how frequently they share the same +entity + + * `debug` : `bool` +debugging flag + + + +--- +#### [`tally_frequencies` classmethod](#textgraphs.GraphOfRelations.tally_frequencies) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/gor.py#L348) + +```python +tally_frequencies(counter) +``` +Tally the frequency of shared entities. + + * `counter` : `collections.Counter` +`counter` data collection for the rel_b/entity pairs + + * *returns* : `int` +tallied values for one relation + + + +--- +#### [`get_affinity_scores` method](#textgraphs.GraphOfRelations.get_affinity_scores) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/gor.py#L401) + +```python +get_affinity_scores(debug=False) +``` +Reproduce metrics based on the example published in _lee2023ingram_ + + * `debug` : `bool` +debugging flag + + * *returns* : `typing.Dict[tuple, float]` +the calculated affinity scores + + + +--- +#### [`trace_metrics` method](#textgraphs.GraphOfRelations.trace_metrics) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/gor.py#L454) + +```python +trace_metrics(scores) +``` +Compare the calculated affinity scores with results from a published +example. + + * `scores` : `typing.Dict[tuple, float]` +the calculated affinity scores between pairs of relations (i.e., observed values) + + * *returns* : `pandas.core.frame.DataFrame` +a `pandas.DataFrame` where the rows compare expected vs. 
observed affinity scores + + +--- +#### [`render_gor_plt` method](#textgraphs.GraphOfRelations.render_gor_plt) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/gor.py#L522) + +```python +render_gor_plt(scores) +``` +Visualize the _graph of relations_ using `matplotlib` + + * `scores` : `typing.Dict[tuple, float]` +the calculated affinity scores between pairs of relations (i.e., observed values) + + + +--- +#### [`render_gor_pyvis` method](#textgraphs.GraphOfRelations.render_gor_pyvis) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/gor.py#L563) + +```python +render_gor_pyvis(scores) +``` +Visualize the _graph of relations_ interactively using `PyVis` + + * `scores` : `typing.Dict[tuple, float]` +the calculated affinity scores between pairs of relations (i.e., observed values) + + * *returns* : `pyvis.network.Network` +a `pyvis.network.Network` representation of the transformed graph + + + +## [`TransArc` class](#TransArc) + +A data class representing one transformed rel-node-rel triple in +a _graph of relations_. + +--- +#### [`__repr__` method](#textgraphs.TransArc.__repr__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/dataclasses.py#L232) + +```python +__repr__() +``` + +## [`RelDir` class](#RelDir) + +Enumeration for the directions of a relation. + +## [`SheafSeed` class](#SheafSeed) + +A data class representing a node from the source graph plus its +partial edge, based on a _Sheaf Theory_ decomposition of a graph. + +--- +#### [`__repr__` method](#textgraphs.SheafSeed.__repr__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/dataclasses.py#L232) + +```python +__repr__() +``` + +## [`Affinity` class](#Affinity) + +A data class representing the affinity scores from one entity +in the transformed _graph of relations_.
+ +NB: there are much more efficient ways to calculate these +_affinity scores_ using sparse tensor algebra; this approach +illustrates the process -- for research and debugging. + +--- +#### [`__repr__` method](#textgraphs.Affinity.__repr__) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/dataclasses.py#L232) + +```python +__repr__() +``` + +--- +## [module functions](#textgraphs) +--- +#### [`calc_quantile_bins` function](#textgraphs.calc_quantile_bins) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/util.py#L65) + +```python +calc_quantile_bins(num_rows) +``` +Calculate the bins to use for a quantile stripe, +using [`numpy.linspace`](https://numpy.org/doc/stable/reference/generated/numpy.linspace.html) + + * `num_rows` : `int` +number of rows in the target dataframe + + * *returns* : `numpy.ndarray` +calculated bins, as a `numpy.ndarray` + + + +--- +#### [`get_repo_version` function](#textgraphs.get_repo_version) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/version.py#L50) + +```python +get_repo_version() +``` +Access the Git repository information and return items to identify +the version/commit running in production. + + * *returns* : `typing.Tuple[str, str]` +version tag and commit hash + + + +--- +#### [`root_mean_square` function](#textgraphs.root_mean_square) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/util.py#L116) + +```python +root_mean_square(values) +``` +Calculate the [*root mean square*](https://mathworld.wolfram.com/Root-Mean-Square.html) +of the values in the given list. 
+ + * `values` : `typing.List[float]` +list of values to use in the RMS calculation + + * *returns* : `float` +RMS metric as a float + + + +--- +#### [`stripe_column` function](#textgraphs.stripe_column) +[*\[source\]*](https://github.com/DerwenAI/textgraphs/blob/main/textgraphs/util.py#L88) + +```python +stripe_column(values, bins) +``` +Stripe a column in a dataframe, by interpolating quantiles into a set of discrete indexes. + + * `values` : `list` +list of values to stripe + + * `bins` : `int` +quantile bins; see [`calc_quantile_bins()`](#calc_quantile_bins-function) + + * *returns* : `numpy.ndarray` +the striped column values, as a `numpy.ndarray` + + + +--- +## [module types](#textgraphs) diff --git a/docs/related.md b/docs/related.md new file mode 100644 index 0000000000000000000000000000000000000000..540884566f00f0b8da95099054a98dcaaeca7523 --- /dev/null +++ b/docs/related.md @@ -0,0 +1,82 @@ +Other projects have investigated related lines of inquiry, which help frame the problems encountered. 
+ +[#loganlpgs19](biblio.md#loganlpgs19), + + - primary goal is to generate entities and facts from a KG + - emphasis on handling rare facts from a broad domain of topics and on improving perplexity + - "we are interested in LMs that dynamically decide the facts to incorporate from the KG, guided by the discourse" + - con: uses relatively simple `G = V,E` graph-theoretic notions of graph data, which is ostensibly RDF + - "traditional LMs are only capable of remembering facts seen at training time, and often have difficulty recalling them" + - introducing KGLM: enables the model to render information it has never seen before, as well as generate out-of-vocabulary tokens + - generates conditional probability of mapping an entity to a parsed token, based on previous tokens and entities within the same stream + - maintains a dynamically growing local KG, a subset of the KG that contains entities that have already been mentioned in the text, and their related entities + - "one of the primary barriers to incorporating factual knowledge into LMs is that training data is hard to obtain" + - provides the `Linked WikiText-2` dataset for running benchmarks, available on GitHub + - "For most LMs, it is difficult to control their generation since factual knowledge is entangled with generation capabilities of the model" + +> Standard language modeling corpora consist only of text, and thus are unable to describe which entities or facts each token is referring to. In contrast, while relation extraction datasets link text to a knowledge graph, the text is made up of disjoint sentences that do not provide sufficient context to train a powerful language model. + + +[#warmerdam2023pydata](biblio.md#warmerdam2023pydata), 20:35-ff + + - using `spaCy` to parse and annotate tokens with metadata + - parse trees => graph => heuristics to map from phrases to concepts + - `sense2vec` to find neighborhoods for surface forms (acronyms, synonyms, etc.) + - UMAP, etc. 
=> hinting toward: "descriptive but not computable" + - UX: active learning vs. annotations of wrong examples using `prodigy` + - "spend more effort per example" => coining term _active teaching_ + - rethinking beyond the "optimality trap" + - "maybe familiarity is a liability in data analytics?" => doubt can be an advantage + + +[#wen2023mindmap](biblio.md#wen2023mindmap), + + - how to prompt LLMs with KGs + - "build a prompting pipeline that endows LLMs with the capability of comprehending KG inputs and inferring with a combined implicit knowledge and the retrieved external knowledge" + - in contrast, the _prompt engineering_ paradigm: "pre-train, prompt, and predict" + - "goal of this work is to build a plug-and-play prompting approach to elicit the graph-of-thoughts reasoning capability in LLMs" + 1. consolidates the retrieved facts from KGs and the implicit knowledge from LLMs + 2. discovers new patterns in input KGs + 3. reasons over the mind map to yield final outputs + - build multiple _evidence sub-graphs_ which get aggregated into _reasoning graphs_, then prompt LLMs and build a _mind map_ to explain the reasoning process + - conjecture that LLMs can comprehend and extract knowledge from a reasoning graph that is described by natural language + - prompting a GPT-3.5 with `MindMap` yields an overwhelming performance over GPT-4 consistently + + +[#tripathi2024deepnlp](biblio.md#tripathi2024deepnlp), + +["Deep NLP on SF Literature"](https://github.com/kkrishna24/deep_nlp_on_sf_literature) +**Krishna Tripathi** _GitHub_ (2024-01-25) + + - processes texts using customized methods, NLTK, and spaCy + - performs domain-specific named entity recognition in multiple stages + - fine-tunes a RoBERTa model using GPT to generate annotated data + - implements multicore LDA for efficient topic modeling and theme-extraction + - modularized code makes this work highly reusable for other domain-specific literature tasks: code can be easily refitted for legal datasets, a 
corpus of classics etc. + - goes the additional step of using these results to **rework training data** and train models + + +[#nayak2023tds](biblio.md#nayak2023tds) + +["How to Convert Any Text Into a Graph of Concepts"](https://towardsdatascience.com/how-to-convert-any-text-into-a-graph-of-concepts-110844f22a1a) +**Rahul Nayak**, _Towards Data Science_ (2023-11-09) + + - "a method to convert any text corpus into a _graph of concepts_" (aka KG) + - use KGs to implement RAG and "chat with our documents" + - Q: is this work solid enough to cite in an academic paper?? + + + +## counterexamples + +[#nizami2023llm](biblio.md#nizami2023llm) + +["Extracting Relation from Sentence using LLM"](https://medium.com/@nizami_muhammad/extracting-relation-from-sentence-using-llm-597d0c0310a8) +**Muhammad Nizami** _Medium_ (2023-11-15) + + +[#lawrence2024ttg](biblio.md#lawrence2024ttg) + +["Text-to-Graph via LLM: pre-training, prompting, or tuning?"](https://medium.com/@peter.lawrence_47665/text-to-graph-via-llm-pre-training-prompting-or-tuning-3233d1165360) +**Peter Lawrence** _Medium_ (2024-01-16) + diff --git a/docs/rubric.md b/docs/rubric.md new file mode 100644 index 0000000000000000000000000000000000000000..9157d3a2c4aafb8559316ade6f43d4efea684334 --- /dev/null +++ b/docs/rubric.md @@ -0,0 +1,55 @@ +# Appendix: ML OSS Evaluation Rubric + +The following checklist provides an evaluation rubric for open source code related to machine learning research. +For any given code repository, tally a score based on these questions: + + - **Q1:** Does the repository use a business-friendly license? + - **Q2:** Does the code install correctly with either `pip` or `conda` package managers? + - **Q3:** Are the library dependencies reasonably current, not using pinned versions for popular libraries? + - **Q4:** Has the project provided sample code which runs without exceptions? + - **Q5:** Can the sample code reproduce the published results of the research? 
+ - **Q6:** Does the library provide affordances for data integration, i.e., it's not optimized for a particular benchmark? + - **Q7:** Can the code be called programmatically as a library, i.e., not run primarily through a command line interface (CLI), and not requiring container/microservice orchestration? + - **Q8:** Will the library and its dependencies pass a reasonable level of security audit without structural changes? + - **Q9:** Does the code support concurrency and parallelization? + - **Q10:** Has the repo been maintained within the past six months? + + +## Dependency Evaluations + +Based on this checklist, the dependencies integrated within this project score as follows: + +rubric | `OpenNRE` | `pulp` | `qwikidata` | `REBEL` | `spaCy` | `Spotlight` | `SpanMarker` | `transformers` +--- | --- | --- | --- | --- | --- | --- | --- | --- +Q1 | x | x | x | x | x | x | x | x +Q2 | x | x | x | x | x | x | x | x +Q3 | x | x | x | x | x | x | x | x +Q4 | x | x | x | x | x | x | x | x +Q5 | x | x | x | x | x | x | x | x +Q6 | x | x | x | x | x | x | x | x +Q7 | x | x | x | x | x | x | x | x +Q8 | x | x | x | x | x | x | x | x +Q9 | x | x | x | x | x | x | x | x +Q10 | x | x | x | x | x | x | x | x + + +[`OpenNRE`](https://github.com/thunlp/OpenNRE/) + +[`pulp`](https://github.com/coin-or/pulp) + +[`qwikidata`](https://github.com/kensho-technologies/qwikidata) + +[`REBEL`](https://github.com/Babelscape/rebel) + +[`spaCy`](https://spacy.io/) + +[`spaCy-DBpedia-Spotlight`](https://github.com/MartinoMensio/spacy-dbpedia-spotlight) + +[`SpanMarker`](https://github.com/tomaarsen/SpanMarkerNER/) + +[`transformers`](https://github.com/huggingface/transformers/) + + +There were many other open source code projects which were evaluated +but scored < 8 and were therefore considered unusable for our work.
+ diff --git a/docs/start.md b/docs/start.md new file mode 100644 index 0000000000000000000000000000000000000000..86235d1f893a7b9948c6a794267272084a136578 --- /dev/null +++ b/docs/start.md @@ -0,0 +1,23 @@ +# Getting Started + +Video Tutorial by artworkbean from the Noun Project + +## Installation + +Install from [PyPi](https://pypi.python.org/pypi/textgraphs): + +```bash +python3 -m pip install -U textgraphs +``` + +## Sample Usage + +Run the demos locally: + +```bash +python3 demo.py +``` + +```bash +streamlit run app.py +``` diff --git a/docs/strategy.md b/docs/strategy.md new file mode 100644 index 0000000000000000000000000000000000000000..9b04d667db1188d7a702827c9b549c38948621e9 --- /dev/null +++ b/docs/strategy.md @@ -0,0 +1,23 @@ +Consider the recent use of _direct preference optimization_ (DPO) with open source tools such as `Argilla` and `Distilabel` to identify and fix data quality issues in the `Zephyr-7B-beta` dataset. This resulted in the `Notus-7B-v1` model, which was created by a relatively small R&D team -- "GPU-poor" -- and then gained high ranking on the Hugging Face leaderboards. + + - + - + +Andrew Ng: + + +> While it's always nice to have massive numbers of NVIDIA H100 or AMD MI300X GPUs, this work is another illustration — out of many, I want to emphasize — that deep thinking with only modest computational resources can carry you far. + +"Direct Preference Optimization: Your Language Model is Secretly a Reward Model" +Rafael Rafailov, et al. + + +RE projects in particular tend to use Wikidata _labels_ (not IRIs) to train models; these are descriptive but not computable + +Components such as NER and RE could be enhanced by reworking the data quality for training data, benchmarks, evals, etc. 
+ + - `SpanMarker` provides a framework for iteration on NER, to fine-tune for specific KGs + + - `OpenNRE` provides a framework for iteration on RE, to fine-tune for specific KGs + +Data-first iterations on these components can take advantage of DPO, sparse fine-tuning, pruning, quantization, and so on, while the _lemma graph_ plus its topological transforms provide enhanced tokenization and better context for training. diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 0000000000000000000000000000000000000000..7053c5718a5918fa28499225a15c04a47fcb9b8f --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,13 @@ +.md-typeset a { + color: hsl(66deg 100% 31%); +} + +.md-typeset a:focus, .md-typeset a:hover { + color: hsl(306, 45%, 57%); +} + +:root { + --md-primary-fg-color: hsl(65, 46%, 58%); + --md-primary-fg-color--light: #000; + --md-primary-fg-color--dark: #FFF; +} \ No newline at end of file diff --git a/docs/topo.md b/docs/topo.md new file mode 100644 index 0000000000000000000000000000000000000000..487f0ad3000c08feb8c955ddc1b9df19519f86b1 --- /dev/null +++ b/docs/topo.md @@ -0,0 +1,7 @@ +**TODO**: summarize from + +Graph topological transform approaches so far (e.g., `lee2023ingram`) have focused on using relation affinities to train _representation learning_ models. This may be another example of using deep learning as a mêlée weapon. Instead, + +results computed from _graph of relations_ analysis naturally feed into _statistical relational learning_ approaches such as _probabilistic soft logic_, to develop rule sets and ground truth for training SRE models.
+ +TODO: survey/compare topological decomposition of graphs, then using statistics to determine how to reconstruct probabilistically => for recomposition of generated graph elements (not simple nodes, edges) diff --git a/docs/tutorial.md b/docs/tutorial.md new file mode 100644 index 0000000000000000000000000000000000000000..ea110f5851791ad611e71cb2816b7a6b3bcef42a --- /dev/null +++ b/docs/tutorial.md @@ -0,0 +1,25 @@ +# Tutorial Syllabus + +Video Tutorial by artworkbean from the Noun Project + +Coding samples in the following notebooks help illustrate the use +of **TextGraphs** and related libraries in Python. + + +## Audience + + * You are a Python programmer who needs to learn how to leverage LLM-augmented workflows to construct KGs + * You are an ML engineer who needs to understand how to integrate LLM research results into production-quality apps + +## Prerequisites + + * Some coding experience in Python (you can read a 20-line program) + * Some familiarity with ML, specifically with LLM applications + * Interest in use cases that need to use NLP to construct KGs + + +## Key Takeaways + + * Hands-on experience with popular open source libraries in Python for natural language at the intersection of LLMs and KGs + * Coding examples that can be used as starting points for your own related projects + * Ways to integrate natural language work with other aspects of graph data science diff --git a/examples/ex0_0.ipynb b/examples/ex0_0.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..699ee0ae4ea895317e89b3c2e501136d0b188af8 --- /dev/null +++ b/examples/ex0_0.ipynb @@ -0,0 +1,1689 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "c32bf0b9-1445-4ede-ae49-7dd63ff3b08e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:41:43.180489Z", + "iopub.status.busy": "2024-01-17T01:41:43.179719Z", + "iopub.status.idle": "2024-01-17T01:41:43.199483Z", + "shell.execute_reply": "2024-01-17T01:41:43.194882Z", 
+ "shell.execute_reply.started": "2024-01-17T01:41:43.180434Z" + } + }, + "outputs": [], + "source": [ + "# for use in tutorial and development; do not include this `sys.path` change in production:\n", + "import sys ; sys.path.insert(0, \"../\")" + ] + }, + { + "cell_type": "markdown", + "id": "c8ff5d81-110c-42ae-8aa7-ed4fffea40c6", + "metadata": {}, + "source": [ + "# demo: TextGraphs + LLMs to construct a 'lemma graph'" + ] + }, + { + "cell_type": "markdown", + "id": "1e847d0a-bc6c-470a-9fef-620ebbdbbbc3", + "metadata": {}, + "source": [ + "_TextGraphs_ library is intended for iterating through a sequence of paragraphs." + ] + }, + { + "cell_type": "markdown", + "id": "61d8d39a-23e4-48e7-b8f4-0dd724ccf586", + "metadata": {}, + "source": [ + "## environment" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "22489527-2ad5-4e3c-be23-f511e6bcf69f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:41:43.205321Z", + "iopub.status.busy": "2024-01-17T01:41:43.204828Z", + "iopub.status.idle": "2024-01-17T01:41:51.202960Z", + "shell.execute_reply": "2024-01-17T01:41:51.201428Z", + "shell.execute_reply.started": "2024-01-17T01:41:43.205291Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "from IPython.display import display, HTML, Image, SVG\n", + "import pathlib\n", + "import typing\n", + "\n", + "from icecream import ic\n", + "from pyinstrument import Profiler\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import pyvis\n", + "import spacy\n", + "\n", + "import textgraphs" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "438f5775-487b-493e-a172-59b652b94955", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:41:51.205309Z", + "iopub.status.busy": "2024-01-17T01:41:51.204860Z", + "iopub.status.idle": "2024-01-17T01:41:51.226390Z", + "shell.execute_reply": "2024-01-17T01:41:51.225503Z", + "shell.execute_reply.started": "2024-01-17T01:41:51.205274Z" + 
} + }, + "outputs": [], + "source": [ + "%load_ext watermark" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "adc052dd-5cca-4d11-b543-3f0999f4f883", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:41:51.228636Z", + "iopub.status.busy": "2024-01-17T01:41:51.228357Z", + "iopub.status.idle": "2024-01-17T01:41:51.282369Z", + "shell.execute_reply": "2024-01-17T01:41:51.281284Z", + "shell.execute_reply.started": "2024-01-17T01:41:51.228610Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last updated: 2024-01-16T17:41:51.229985-08:00\n", + "\n", + "Python implementation: CPython\n", + "Python version : 3.10.11\n", + "IPython version : 8.20.0\n", + "\n", + "Compiler : Clang 13.0.0 (clang-1300.0.29.30)\n", + "OS : Darwin\n", + "Release : 21.6.0\n", + "Machine : x86_64\n", + "Processor : i386\n", + "CPU cores : 8\n", + "Architecture: 64bit\n", + "\n" + ] + } + ], + "source": [ + "%watermark" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6e4618da-daf9-44c9-adbb-e5781dba5504", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:41:51.291126Z", + "iopub.status.busy": "2024-01-17T01:41:51.287449Z", + "iopub.status.idle": "2024-01-17T01:41:51.322186Z", + "shell.execute_reply": "2024-01-17T01:41:51.320908Z", + "shell.execute_reply.started": "2024-01-17T01:41:51.291072Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sys : 3.10.11 (v3.10.11:7d4cc5aa85, Apr 4 2023, 19:05:19) [Clang 13.0.0 (clang-1300.0.29.30)]\n", + "spacy : 3.7.2\n", + "pandas : 2.1.4\n", + "matplotlib: 3.8.2\n", + "textgraphs: 0.5.0\n", + "pyvis : 0.3.2\n", + "\n" + ] + } + ], + "source": [ + "%watermark --iversions" + ] + }, + { + "cell_type": "markdown", + "id": "1a04e3dc-57d8-43a4-a342-cc38b86fc6a6", + "metadata": {}, + "source": [ + "## parse a document" + ] + }, + { + "cell_type": "markdown", + "id": 
"7c567afd-2f44-4391-899a-da6aba3d222e", + "metadata": {}, + "source": [ + "provide the source text" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "630430c5-21dc-4897-9a4b-3b01baf3de17", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:41:51.326474Z", + "iopub.status.busy": "2024-01-17T01:41:51.325657Z", + "iopub.status.idle": "2024-01-17T01:41:51.334443Z", + "shell.execute_reply": "2024-01-17T01:41:51.332925Z", + "shell.execute_reply.started": "2024-01-17T01:41:51.326405Z" + } + }, + "outputs": [], + "source": [ + "SRC_TEXT: str = \"\"\" \n", + "Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.\n", + "After the war, Werner fled to America to become famous.\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "01152885-f301-49b1-ab61-f5b19d81c036", + "metadata": {}, + "source": [ + "set up the statistical stack profiling" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2a289117-301d-4027-ae1b-200201fb5f93", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:41:51.346396Z", + "iopub.status.busy": "2024-01-17T01:41:51.346074Z", + "iopub.status.idle": "2024-01-17T01:41:51.352763Z", + "shell.execute_reply": "2024-01-17T01:41:51.350319Z", + "shell.execute_reply.started": "2024-01-17T01:41:51.346368Z" + } + }, + "outputs": [], + "source": [ + "profiler: Profiler = Profiler()\n", + "profiler.start()" + ] + }, + { + "cell_type": "markdown", + "id": "bf9d4f99-b82b-4d11-a9a4-31d0337f4aa8", + "metadata": {}, + "source": [ + "set up the `TextGraphs` pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "da6fcb0f-b2ac-4f74-af39-2c129c750cab", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:41:51.357183Z", + "iopub.status.busy": "2024-01-17T01:41:51.354882Z", + "iopub.status.idle": "2024-01-17T01:42:10.886781Z", + "shell.execute_reply": "2024-01-17T01:42:10.884253Z", + 
"shell.execute_reply.started": "2024-01-17T01:41:51.357081Z" + } + }, + "outputs": [], + "source": [ + "tg: textgraphs.TextGraphs = textgraphs.TextGraphs(\n", + " factory = textgraphs.PipelineFactory(\n", + " spacy_model = textgraphs.SPACY_MODEL,\n", + " ner = None,\n", + " kg = textgraphs.KGWikiMedia(\n", + " spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API,\n", + " dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API,\n", + " dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API,\n", + " \t\twikidata_api = textgraphs.WIKIDATA_API,\n", + " min_alias = textgraphs.DBPEDIA_MIN_ALIAS,\n", + " min_similarity = textgraphs.DBPEDIA_MIN_SIM,\n", + " ),\n", + " infer_rels = [\n", + " \t\ttextgraphs.InferRel_OpenNRE(\n", + " model = textgraphs.OPENNRE_MODEL,\n", + " max_skip = textgraphs.MAX_SKIP,\n", + " min_prob = textgraphs.OPENNRE_MIN_PROB,\n", + " \t\t),\n", + " textgraphs.InferRel_Rebel(\n", + " lang = \"en_XX\",\n", + " mrebel_model = textgraphs.MREBEL_MODEL,\n", + " ),\n", + " ],\n", + " ),\n", + ")\n", + "\n", + "pipe: textgraphs.Pipeline = tg.create_pipeline(\n", + " SRC_TEXT.strip(),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8b71b841-0cf5-4cc6-af4c-c85344b8f6c5", + "metadata": {}, + "source": [ + "## visualize the parse results" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5901a49e-3f90-4061-9c3a-e9d1f05b40f3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:10.892508Z", + "iopub.status.busy": "2024-01-17T01:42:10.891377Z", + "iopub.status.idle": "2024-01-17T01:42:10.925630Z", + "shell.execute_reply": "2024-01-17T01:42:10.921355Z", + "shell.execute_reply.started": "2024-01-17T01:42:10.892351Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " Werner Herzog\n", + " PERSON\n", + "\n", + " is a remarkable filmmaker and an intellectual originally from \n", + "\n", + " Germany\n", + " GPE\n", + "\n", + ", the son of \n", + "\n", + " Dietrich Herzog\n", + " PERSON\n", + "\n", + ".
After the war, \n", + "\n", + " Werner\n", + " PERSON\n", + "\n", + " fled to \n", + "\n", + " America\n", + " GPE\n", + "\n", + " to become famous.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "spacy.displacy.render(\n", + " pipe.ner_doc,\n", + " style = \"ent\",\n", + " jupyter = True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ffc0863d-5ed4-4857-aee1-96f26472f1ef", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:10.929432Z", + "iopub.status.busy": "2024-01-17T01:42:10.928841Z", + "iopub.status.idle": "2024-01-17T01:42:10.974738Z", + "shell.execute_reply": "2024-01-17T01:42:10.973574Z", + "shell.execute_reply.started": "2024-01-17T01:42:10.929374Z" + } + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + " Werner Herzog\n", + " PROPN\n", + "\n", + "\n", + "\n", + " is\n", + " AUX\n", + "\n", + "\n", + "\n", + " a\n", + " DET\n", + "\n", + "\n", + "\n", + " remarkable\n", + " ADJ\n", + "\n", + "\n", + "\n", + " filmmaker\n", + " NOUN\n", + "\n", + "\n", + "\n", + " and\n", + " CCONJ\n", + "\n", + "\n", + "\n", + " an\n", + " DET\n", + "\n", + "\n", + "\n", + " intellectual\n", + " NOUN\n", + "\n", + "\n", + "\n", + " originally\n", + " ADV\n", + "\n", + "\n", + "\n", + " from\n", + " ADP\n", + "\n", + "\n", + "\n", + " Germany,\n", + " PROPN\n", + "\n", + "\n", + "\n", + " the\n", + " DET\n", + "\n", + "\n", + "\n", + " son\n", + " NOUN\n", + "\n", + "\n", + "\n", + " of\n", + " ADP\n", + "\n", + "\n", + "\n", + " Dietrich Herzog.\n", + " PUNCT\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " SPACE\n", + "\n", + "\n", + "\n", + " After\n", + " ADP\n", + "\n", + "\n", + "\n", + " the\n", + " DET\n", + "\n", + "\n", + "\n", + " war,\n", + " NOUN\n", + "\n", + "\n", + "\n", + " Werner\n", + " PROPN\n", + "\n", + "\n", + "\n", + " fled\n", + " VERB\n", + "\n", + "\n", + "\n", + " to\n", + " ADP\n", + "\n", + "\n", + "\n", + " America\n", + " PROPN\n", + "\n", + "\n", + "\n", + " to\n", + " PART\n", + "\n", + "\n", + "\n", + " become\n", + " VERB\n", + "\n", + 
"\n", + "\n", + " famous.\n", + " ADJ\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nsubj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " amod\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " attr\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " cc\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " conj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " advmod\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pobj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " appos\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " punct\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " dep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pobj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nsubj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pobj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " aux\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " advcl\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " acomp\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "parse_svg: str = spacy.displacy.render(\n", + " pipe.ner_doc,\n", + " style = 
\"dep\",\n", + " jupyter = False,\n", + ")\n", + "\n", + "display(SVG(parse_svg))" + ] + }, + { + "cell_type": "markdown", + "id": "5e9de8e0-5a79-45f9-8c9d-6c68c560040e", + "metadata": {}, + "source": [ + "## collect graph elements from the parse" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4d5abe40-d483-44f5-a747-92e0ac9c8b0d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:10.978005Z", + "iopub.status.busy": "2024-01-17T01:42:10.977288Z", + "iopub.status.idle": "2024-01-17T01:42:10.985871Z", + "shell.execute_reply": "2024-01-17T01:42:10.984706Z", + "shell.execute_reply.started": "2024-01-17T01:42:10.977922Z" + } + }, + "outputs": [], + "source": [ + "tg.collect_graph_elements(\n", + " pipe,\n", + " debug = False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7c440db4-fc01-44ff-8d8d-03517cc1f1e4", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:10.989542Z", + "iopub.status.busy": "2024-01-17T01:42:10.988271Z", + "iopub.status.idle": "2024-01-17T01:42:11.551822Z", + "shell.execute_reply": "2024-01-17T01:42:11.551011Z", + "shell.execute_reply.started": "2024-01-17T01:42:10.989493Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ic| len(tg.nodes.values()): 36\n", + "ic| len(tg.edges.values()): 42\n" + ] + } + ], + "source": [ + "ic(len(tg.nodes.values()));\n", + "ic(len(tg.edges.values()));" + ] + }, + { + "cell_type": "markdown", + "id": "76caa0e6-351a-48e8-9e1f-94a31d612ee4", + "metadata": {}, + "source": [ + "## perform entity linking" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6d23e215-9d8c-4e03-8040-fa9398fad62b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:11.553477Z", + "iopub.status.busy": "2024-01-17T01:42:11.553267Z", + "iopub.status.idle": "2024-01-17T01:42:32.304619Z", + "shell.execute_reply": 
"2024-01-17T01:42:32.302739Z", + "shell.execute_reply.started": "2024-01-17T01:42:11.553444Z" + } + }, + "outputs": [], + "source": [ + "tg.perform_entity_linking(\n", + " pipe,\n", + " debug = False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f7e31cf4-0f49-4fef-affa-04c9833a6236", + "metadata": {}, + "source": [ + "## infer relations" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "32bb75af-e806-4334-a876-127f2704ffbf", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:32.311135Z", + "iopub.status.busy": "2024-01-17T01:42:32.310408Z", + "iopub.status.idle": "2024-01-17T01:42:46.741855Z", + "shell.execute_reply": "2024-01-17T01:42:46.740354Z", + "shell.execute_reply.started": "2024-01-17T01:42:32.311083Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Edge(src_node=0, dst_node=10, kind=, rel='https://schema.org/nationality', prob=1.0, count=1),\n", + " Edge(src_node=15, dst_node=0, kind=, rel='https://schema.org/children', prob=1.0, count=1),\n", + " Edge(src_node=27, dst_node=22, kind=, rel='https://schema.org/event', prob=1.0, count=1)]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inferred_edges: list = await tg.infer_relations_async(\n", + " pipe,\n", + " debug = False,\n", + ")\n", + "\n", + "inferred_edges" + ] + }, + { + "cell_type": "markdown", + "id": "76fa3fcb-6432-4ed5-80d1-569be4253e6e", + "metadata": {}, + "source": [ + "## construct a lemma graph" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "79efb0d1-dfc4-4f45-8c4e-b42a080832e7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:46.744612Z", + "iopub.status.busy": "2024-01-17T01:42:46.744082Z", + "iopub.status.idle": "2024-01-17T01:42:46.752790Z", + "shell.execute_reply": "2024-01-17T01:42:46.751990Z", + "shell.execute_reply.started": "2024-01-17T01:42:46.744560Z" + } + }, + "outputs": [], + "source": [ + 
"tg.construct_lemma_graph(\n", + " debug = False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "84a4b0c6-ebd5-4794-ac2d-ee191ab7ed0b", + "metadata": {}, + "source": [ + "## extract ranked entities" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "70134eb6-c1b4-474e-81cd-12b6b7f38afd", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:46.756709Z", + "iopub.status.busy": "2024-01-17T01:42:46.754800Z", + "iopub.status.idle": "2024-01-17T01:42:47.059654Z", + "shell.execute_reply": "2024-01-17T01:42:47.058466Z", + "shell.execute_reply.started": "2024-01-17T01:42:46.756630Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "tg.calc_phrase_ranks(\n", + " pr_alpha = textgraphs.PAGERANK_ALPHA,\n", + " debug = False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1ba5b734-665a-4bc0-9eca-11b2ba074fed", + "metadata": {}, + "source": [ + "show the resulting entities extracted from the document" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a77a0ede-2225-47c1-8ea8-4ae2220aa086", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:47.062142Z", + "iopub.status.busy": "2024-01-17T01:42:47.061624Z", + "iopub.status.idle": "2024-01-17T01:42:47.098472Z", + "shell.execute_reply": "2024-01-17T01:42:47.097234Z", + "shell.execute_reply.started": "2024-01-17T01:42:47.062101Z" + }, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
node_idtextposlabelcountweight
00Werner HerzogPROPNdbr:Werner_Herzog10.080547
110GermanyPROPNdbr:Germany10.080437
215Dietrich HerzogPROPNdbo:Person10.079048
327AmericaPROPNdbr:United_States10.079048
424WernerPROPNdbo:Person10.077633
54filmmakerNOUNowl:Thing10.076309
622warNOUNowl:Thing10.076309
732a remarkable filmmakernoun_chunkNone10.076077
87intellectualNOUNowl:Thing10.074725
913sonNOUNowl:Thing10.074725
1033an intellectualnoun_chunkNone10.074606
1134the sonnoun_chunkNone10.074606
1235the warnoun_chunkNone10.074606
\n", + "
" + ], + "text/plain": [ + " node_id text pos label count \\\n", + "0 0 Werner Herzog PROPN dbr:Werner_Herzog 1 \n", + "1 10 Germany PROPN dbr:Germany 1 \n", + "2 15 Dietrich Herzog PROPN dbo:Person 1 \n", + "3 27 America PROPN dbr:United_States 1 \n", + "4 24 Werner PROPN dbo:Person 1 \n", + "5 4 filmmaker NOUN owl:Thing 1 \n", + "6 22 war NOUN owl:Thing 1 \n", + "7 32 a remarkable filmmaker noun_chunk None 1 \n", + "8 7 intellectual NOUN owl:Thing 1 \n", + "9 13 son NOUN owl:Thing 1 \n", + "10 33 an intellectual noun_chunk None 1 \n", + "11 34 the son noun_chunk None 1 \n", + "12 35 the war noun_chunk None 1 \n", + "\n", + " weight \n", + "0 0.080547 \n", + "1 0.080437 \n", + "2 0.079048 \n", + "3 0.079048 \n", + "4 0.077633 \n", + "5 0.076309 \n", + "6 0.076309 \n", + "7 0.076077 \n", + "8 0.074725 \n", + "9 0.074725 \n", + "10 0.074606 \n", + "11 0.074606 \n", + "12 0.074606 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df: pd.DataFrame = tg.get_phrases_as_df()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "3143955c-446a-4e6c-834c-583ab173f446", + "metadata": {}, + "source": [ + "## visualize the lemma graph" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "05b409af-14df-4158-9709-ffe2d79e864b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-24T17:26:10.024360Z", + "iopub.status.busy": "2024-01-24T17:26:10.020502Z", + "iopub.status.idle": "2024-01-24T17:26:10.321275Z", + "shell.execute_reply": "2024-01-24T17:26:10.319871Z", + "shell.execute_reply.started": "2024-01-24T17:26:10.024325Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "render: textgraphs.RenderPyVis = tg.create_render()\n", + "\n", + "pv_graph: pyvis.network.Network = render.render_lemma_graph(\n", + " debug = False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7b5d3e88-6669-4df1-a20a-587cc6a7db12", + "metadata": {}, + "source": [ + "initialize the 
layout parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "b212f5ed-03d6-439f-92ae-f2cbedb18609", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-24T17:26:11.343717Z", + "iopub.status.busy": "2024-01-24T17:26:11.343435Z", + "iopub.status.idle": "2024-01-24T17:26:11.385195Z", + "shell.execute_reply": "2024-01-24T17:26:11.379207Z", + "shell.execute_reply.started": "2024-01-24T17:26:11.343691Z" + } + }, + "outputs": [], + "source": [ + "pv_graph.force_atlas_2based(\n", + " gravity = -38,\n", + " central_gravity = 0.01,\n", + " spring_length = 231,\n", + " spring_strength = 0.7,\n", + " damping = 0.8,\n", + " overlap = 0,\n", + ")\n", + "\n", + "pv_graph.show_buttons(filter_ = [ \"physics\" ])\n", + "pv_graph.toggle_physics(True)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "2f952a7c-3130-49c9-b659-fb941e9e0bfe", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-24T17:26:12.529172Z", + "iopub.status.busy": "2024-01-24T17:26:12.528709Z", + "iopub.status.idle": "2024-01-24T17:26:12.951605Z", + "shell.execute_reply": "2024-01-24T17:26:12.915999Z", + "shell.execute_reply.started": "2024-01-24T17:26:12.529144Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tmp.fig01.html\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pv_graph.prep_notebook()\n", + "pv_graph.show(\"tmp.fig01.html\")" + ] + }, + { + "cell_type": "markdown", + "id": "dc6654c8-0a4c-4e62-8cfc-f49e33f81064", + "metadata": {}, + "source": [ + "## generate a word cloud" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "ba9543cd-b1e9-4f0a-930c-7a0a6ccb7f0a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:47.192425Z", + "iopub.status.busy": "2024-01-17T01:42:47.191808Z", + 
"iopub.status.idle": "2024-01-17T01:42:47.414389Z", + "shell.execute_reply": "2024-01-17T01:42:47.413720Z", + "shell.execute_reply.started": "2024-01-17T01:42:47.192376Z" + } + }, + "outputs": [ + { + "data": { + "image/jpeg": "", + "image/png": "", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "wordcloud = render.generate_wordcloud()\n", + "display(wordcloud.to_image())" + ] + }, + { + "cell_type": "markdown", + "id": "9e55b207-16d2-488a-b89b-b6ea8aed0ad9", + "metadata": {}, + "source": [ + "## cluster communities in the lemma graph" + ] + }, + { + "cell_type": "markdown", + "id": "9bdcbe84-ae83-4fa6-91ad-3069b212dc72", + "metadata": {}, + "source": [ + "In the tutorial\n", + "
\"How to Convert Any Text Into a Graph of Concepts\", \n", + "Rahul Nayak uses the\n", + "girvan-newman\n", + "algorithm to split the graph into communities, then clusters on those communities.\n", + "His approach works well for unsupervised clustering of key phrases which have been extracted from many documents.\n", + "Note, however, that Nayak was working with entities extracted from \"chunks\" of text, not with a text graph as we do here." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "cd2d2f21-966e-40d6-8335-20dbfd8316ed", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:47.416003Z", + "iopub.status.busy": "2024-01-17T01:42:47.415758Z", + "iopub.status.idle": "2024-01-17T01:42:48.383920Z", + "shell.execute_reply": "2024-01-17T01:42:48.383286Z", + "shell.execute_reply.started": "2024-01-17T01:42:47.415969Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "render.draw_communities();" + ] + }, + { + "cell_type": "markdown", + "id": "bb59d135-5b14-4841-ba76-89712017e4d6", + "metadata": {}, + "source": [ + "## graph of relations transform" + ] + }, + { + "cell_type": "markdown", + "id": "d751fa5e-e6ca-4de6-a3f3-c9f8acb43e5e", + "metadata": {}, + "source": [ + "Show a transformed graph, based on _graph of relations_ (see: `lee2023ingram`)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "5ec1352a-f281-4965-b68d-3e86c0269f09", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-24T17:27:02.399419Z", + "iopub.status.busy": "2024-01-24T17:27:02.398846Z", + "iopub.status.idle": "2024-01-24T17:27:02.528662Z", + "shell.execute_reply": "2024-01-24T17:27:02.527016Z", + "shell.execute_reply.started": "2024-01-24T17:27:02.399365Z" + } + }, + "outputs": [], + "source": [ + "graph: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(\n", + " tg\n", + ")\n", + "\n", + "graph.seeds()\n", + "graph.construct_gor()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "a1dc17f1-eaeb-469a-8593-76950d70cc95", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:48.395746Z", + "iopub.status.busy": "2024-01-17T01:42:48.395315Z", + "iopub.status.idle": "2024-01-17T01:42:48.444015Z", + "shell.execute_reply": "2024-01-17T01:42:48.443074Z", + "shell.execute_reply.started": "2024-01-17T01:42:48.395667Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tmp.fig02.html\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores: typing.Dict[ tuple, float ] = graph.get_affinity_scores()\n", + "pv_graph: pyvis.network.Network = graph.render_gor_pyvis(scores)\n", + "\n", + 
"pv_graph.force_atlas_2based(\n", + " gravity = -38,\n", + " central_gravity = 0.01,\n", + " spring_length = 231,\n", + " spring_strength = 0.7,\n", + " damping = 0.8,\n", + " overlap = 0,\n", + ")\n", + "\n", + "pv_graph.show_buttons(filter_ = [ \"physics\" ])\n", + "pv_graph.toggle_physics(True)\n", + "\n", + "pv_graph.prep_notebook()\n", + "pv_graph.show(\"tmp.fig02.html\")" + ] + }, + { + "cell_type": "markdown", + "id": "c191fde0-1093-4cdc-a3ea-86cc2bf394b8", + "metadata": {}, + "source": [ + "*What does this transform provide?*\n", + "\n", + "By using a _graph of relations_ dual representation of our graph data, first and foremost we obtain a more compact representation of the relations in the graph, and a means of making inferences (e.g., _link prediction_) where there is substantially more invariance in the training data.\n", + "\n", + "Also recognize that for a parse graph of a paragraph in the English language, the most interesting nodes will probably be either subjects (`nsubj`) or objects of prepositions (`pobj`). Here in the _graph of relations_ we see illustrated how the important details from _entity linking_ tend to cluster near either `nsubj` or `pobj` entities, connected through punctuation. This is not as readily observed in the earlier visualization of the _lemma graph_."
+ ] + }, + { + "cell_type": "markdown", + "id": "68ea1b7e-bed2-453b-b210-129ddb082e2f", + "metadata": {}, + "source": [ + "## extract as RDF triples" + ] + }, + { + "cell_type": "markdown", + "id": "ae76750c-feac-414d-8362-5ab92294c858", + "metadata": {}, + "source": [ + "Extract the nodes and edges which have IRIs, to create an \"abstraction layer\" as a semantic graph at a higher level of abstraction above the _lemma graph_:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "d9036aec-7c38-4fd7-b2f5-4615bf95c643", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:48.446174Z", + "iopub.status.busy": "2024-01-17T01:42:48.445378Z", + "iopub.status.idle": "2024-01-17T01:42:48.478519Z", + "shell.execute_reply": "2024-01-17T01:42:48.476893Z", + "shell.execute_reply.started": "2024-01-17T01:42:48.446112Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "@base .\n", + "@prefix dbo: .\n", + "@prefix dbr: .\n", + "@prefix schema: .\n", + "@prefix skos: .\n", + "@prefix wd_ent: .\n", + "\n", + "dbr:Germany skos:definition \"Germany (German: Deutschland, German pronunciation: [ˈdɔʏtʃlant]), constitutionally the Federal\"@en ;\n", + " skos:prefLabel \"Germany\"@en .\n", + "\n", + "dbr:United_States skos:definition \"The United States of America (USA), commonly known as the United States (U.S. 
or US) or America\"@en ;\n", + " skos:prefLabel \"United States\"@en .\n", + "\n", + "dbr:Werner_Herzog skos:definition \"Werner Herzog (German: [ˈvɛɐ̯nɐ ˈhɛɐ̯tsoːk]; born 5 September 1942) is a German film director\"@en ;\n", + " skos:prefLabel \"Werner Herzog\"@en .\n", + "\n", + "wd_ent:Q183 skos:definition \"country in Central Europe\"@en ;\n", + " skos:prefLabel \"Germany\"@en .\n", + "\n", + "wd_ent:Q44131 skos:definition \"German film director, producer, screenwriter, actor and opera director\"@en ;\n", + " skos:prefLabel \"Werner Herzog\"@en .\n", + "\n", + " a dbo:Country ;\n", + " skos:prefLabel \"America\"@en ;\n", + " schema:event .\n", + "\n", + " a dbo:Person ;\n", + " skos:prefLabel \"Dietrich Herzog\"@en ;\n", + " schema:children .\n", + "\n", + " skos:prefLabel \"filmmaker\"@en .\n", + "\n", + " skos:prefLabel \"intellectual\"@en .\n", + "\n", + " skos:prefLabel \"son\"@en .\n", + "\n", + " a dbo:Person ;\n", + " skos:prefLabel \"Werner\"@en .\n", + "\n", + " a dbo:Country ;\n", + " skos:prefLabel \"Germany\"@en .\n", + "\n", + " skos:prefLabel \"war\"@en .\n", + "\n", + " a dbo:Person ;\n", + " skos:prefLabel \"Werner Herzog\"@en ;\n", + " schema:nationality .\n", + "\n", + "dbo:Country skos:definition \"Countries, cities, states\"@en ;\n", + " skos:prefLabel \"country\"@en .\n", + "\n", + "dbo:Person skos:definition \"People, including fictional\"@en ;\n", + " skos:prefLabel \"person\"@en .\n", + "\n", + "\n" + ] + } + ], + "source": [ + "triples: str = tg.export_rdf()\n", + "print(triples)" + ] + }, + { + "cell_type": "markdown", + "id": "ff49fe28-e75f-4590-8b87-0d8962928cba", + "metadata": {}, + "source": [ + "## statistical stack profile instrumentation" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "af4ecb06-370f-4077-9899-29a1673e4768", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:48.482588Z", + "iopub.status.busy": "2024-01-17T01:42:48.481127Z", + "iopub.status.idle": 
"2024-01-17T01:42:48.493047Z", + "shell.execute_reply": "2024-01-17T01:42:48.492253Z", + "shell.execute_reply.started": "2024-01-17T01:42:48.482444Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profiler.stop()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "d5ac2ce6-15b1-41ad-8215-8a5f76036cf1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:42:48.495272Z", + "iopub.status.busy": "2024-01-17T01:42:48.494829Z", + "iopub.status.idle": "2024-01-17T01:42:50.376362Z", + "shell.execute_reply": "2024-01-17T01:42:50.375698Z", + "shell.execute_reply.started": "2024-01-17T01:42:48.495244Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 17:41:51 Samples: 11163\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 57.137 CPU time: 72.235\n", + "/ _/ v4.6.1\n", + "\n", + "Program: /Users/paco/src/textgraphs/venv/lib/python3.10/site-packages/ipykernel_launcher.py -f /Users/paco/Library/Jupyter/runtime/kernel-8ffadb7d-3b45-4e0e-a94f-f098e5ad9fbe.json\n", + "\n", + "57.136 _UnixSelectorEventLoop._run_once asyncio/base_events.py:1832\n", + "└─ 57.135 Handle._run asyncio/events.py:78\n", + " [12 frames hidden] asyncio, ipykernel, IPython\n", + " 41.912 ZMQInteractiveShell.run_ast_nodes IPython/core/interactiveshell.py:3394\n", + " ├─ 20.701 ../ipykernel_5151/1245857438.py:1\n", + " │ └─ 20.701 TextGraphs.perform_entity_linking textgraphs/doc.py:534\n", + " │ └─ 20.701 KGWikiMedia.perform_entity_linking textgraphs/kg.py:306\n", + " │ ├─ 10.790 KGWikiMedia._link_kg_search_entities textgraphs/kg.py:932\n", + " │ │ └─ 10.787 KGWikiMedia.dbpedia_search_entity textgraphs/kg.py:641\n", + " │ │ └─ 10.711 get requests/api.py:62\n", + " │ │ [37 frames hidden] requests, urllib3, http, socket, ssl,...\n", + " │ ├─ 9.143 
KGWikiMedia._link_spotlight_entities textgraphs/kg.py:851\n", + " │ │ └─ 9.140 KGWikiMedia.dbpedia_search_entity textgraphs/kg.py:641\n", + " │ │ └─ 9.095 get requests/api.py:62\n", + " │ │ [37 frames hidden] requests, urllib3, http, socket, ssl,...\n", + " │ └─ 0.768 KGWikiMedia._secondary_entity_linking textgraphs/kg.py:1060\n", + " │ └─ 0.768 KGWikiMedia.wikidata_search textgraphs/kg.py:575\n", + " │ └─ 0.765 KGWikiMedia._wikidata_endpoint textgraphs/kg.py:444\n", + " │ └─ 0.765 get requests/api.py:62\n", + " │ [7 frames hidden] requests, urllib3\n", + " └─ 19.514 ../ipykernel_5151/1708547378.py:1\n", + " ├─ 14.502 InferRel_Rebel.__init__ textgraphs/rel.py:121\n", + " │ └─ 14.338 pipeline transformers/pipelines/__init__.py:531\n", + " │ [39 frames hidden] transformers, torch, , json\n", + " ├─ 3.437 PipelineFactory.__init__ textgraphs/pipe.py:434\n", + " │ └─ 3.420 load spacy/__init__.py:27\n", + " │ [20 frames hidden] spacy, en_core_web_sm, catalogue, imp...\n", + " ├─ 0.900 InferRel_OpenNRE.__init__ textgraphs/rel.py:33\n", + " │ └─ 0.888 get_model opennre/pretrain.py:126\n", + " └─ 0.672 TextGraphs.create_pipeline textgraphs/doc.py:103\n", + " └─ 0.672 PipelineFactory.create_pipeline textgraphs/pipe.py:508\n", + " └─ 0.672 Pipeline.__init__ textgraphs/pipe.py:216\n", + " └─ 0.672 English.__call__ spacy/language.py:1016\n", + " [11 frames hidden] spacy, spacy_dbpedia_spotlight, reque...\n", + " 14.363 InferRel_Rebel.gen_triples_async textgraphs/pipe.py:188\n", + " ├─ 13.670 InferRel_Rebel.gen_triples textgraphs/rel.py:259\n", + " │ ├─ 12.439 InferRel_Rebel.tokenize_sent textgraphs/rel.py:145\n", + " │ │ └─ 12.436 TranslationPipeline.__call__ transformers/pipelines/text2text_generation.py:341\n", + " │ │ [42 frames hidden] transformers, torch, \n", + " │ └─ 1.231 KGWikiMedia.resolve_rel_iri textgraphs/kg.py:370\n", + " │ └─ 0.753 get_entity_dict_from_api qwikidata/linked_data_interface.py:21\n", + " │ [8 frames hidden] qwikidata, requests, urllib3\n", + " └─ 
0.693 InferRel_OpenNRE.gen_triples textgraphs/rel.py:58\n", + "\n", + "\n" + ] + } + ], + "source": [ + "profiler.print()" + ] + }, + { + "cell_type": "markdown", + "id": "c47bcfd2-2bd6-49a5-8f1a-102d90edde39", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## outro" + ] + }, + { + "cell_type": "markdown", + "id": "68bea4f9-aec2-4b28-8f08-a4034851d066", + "metadata": {}, + "source": [ + "_\\[ more parts are in progress, getting added to this demo \\]_" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/ex1_0.ipynb b/examples/ex1_0.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..07103680f7696c84037771221c1184f18e847e99 --- /dev/null +++ b/examples/ex1_0.ipynb @@ -0,0 +1,1387 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "c32bf0b9-1445-4ede-ae49-7dd63ff3b08e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:37.268964Z", + "iopub.status.busy": "2024-01-17T01:35:37.268658Z", + "iopub.status.idle": "2024-01-17T01:35:37.284720Z", + "shell.execute_reply": "2024-01-17T01:35:37.282292Z", + "shell.execute_reply.started": "2024-01-17T01:35:37.268927Z" + } + }, + "outputs": [], + "source": [ + "# for use in tutorial and development; do not include this `sys.path` change in production:\n", + "import sys ; sys.path.insert(0, \"../\")" + ] + }, + { + "cell_type": "markdown", + "id": "c8ff5d81-110c-42ae-8aa7-ed4fffea40c6", + "metadata": {}, + "source": [ + "# reproduce results from the \"InGram\" paper" + ] + }, + { + "cell_type": "markdown", + "id": 
"1e847d0a-bc6c-470a-9fef-620ebbdbbbc3", + "metadata": {}, + "source": [ + "This is an attempt to reproduce the _graph of relations_ example given in `lee2023ingram`" + ] + }, + { + "cell_type": "markdown", + "id": "61d8d39a-23e4-48e7-b8f4-0dd724ccf586", + "metadata": {}, + "source": [ + "## environment" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "22489527-2ad5-4e3c-be23-f511e6bcf69f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:37.296455Z", + "iopub.status.busy": "2024-01-17T01:35:37.295661Z", + "iopub.status.idle": "2024-01-17T01:35:45.520968Z", + "shell.execute_reply": "2024-01-17T01:35:45.519870Z", + "shell.execute_reply.started": "2024-01-17T01:35:37.296419Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import pathlib\n", + "import typing\n", + "\n", + "from icecream import ic\n", + "from pyinstrument import Profiler\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import pyvis\n", + "\n", + "import textgraphs" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "438f5775-487b-493e-a172-59b652b94955", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:45.525301Z", + "iopub.status.busy": "2024-01-17T01:35:45.524842Z", + "iopub.status.idle": "2024-01-17T01:35:45.547432Z", + "shell.execute_reply": "2024-01-17T01:35:45.546101Z", + "shell.execute_reply.started": "2024-01-17T01:35:45.525270Z" + } + }, + "outputs": [], + "source": [ + "%load_ext watermark" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "adc052dd-5cca-4d11-b543-3f0999f4f883", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:45.548916Z", + "iopub.status.busy": "2024-01-17T01:35:45.548691Z", + "iopub.status.idle": "2024-01-17T01:35:45.592124Z", + "shell.execute_reply": "2024-01-17T01:35:45.590790Z", + "shell.execute_reply.started": "2024-01-17T01:35:45.548889Z" + } + }, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "Last updated: 2024-01-16T17:35:45.550539-08:00\n", + "\n", + "Python implementation: CPython\n", + "Python version : 3.10.11\n", + "IPython version : 8.20.0\n", + "\n", + "Compiler : Clang 13.0.0 (clang-1300.0.29.30)\n", + "OS : Darwin\n", + "Release : 21.6.0\n", + "Machine : x86_64\n", + "Processor : i386\n", + "CPU cores : 8\n", + "Architecture: 64bit\n", + "\n" + ] + } + ], + "source": [ + "%watermark" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6e4618da-daf9-44c9-adbb-e5781dba5504", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:45.597302Z", + "iopub.status.busy": "2024-01-17T01:35:45.596553Z", + "iopub.status.idle": "2024-01-17T01:35:45.623704Z", + "shell.execute_reply": "2024-01-17T01:35:45.621991Z", + "shell.execute_reply.started": "2024-01-17T01:35:45.597251Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "matplotlib: 3.8.2\n", + "pandas : 2.1.4\n", + "pyvis : 0.3.2\n", + "textgraphs: 0.5.0\n", + "sys : 3.10.11 (v3.10.11:7d4cc5aa85, Apr 4 2023, 19:05:19) [Clang 13.0.0 (clang-1300.0.29.30)]\n", + "\n" + ] + } + ], + "source": [ + "%watermark --iversions" + ] + }, + { + "cell_type": "markdown", + "id": "1a04e3dc-57d8-43a4-a342-cc38b86fc6a6", + "metadata": {}, + "source": [ + "## load example graph" + ] + }, + { + "cell_type": "markdown", + "id": "7c567afd-2f44-4391-899a-da6aba3d222e", + "metadata": {}, + "source": [ + "load from a JSON file which replicates the data for the \"Figure 3\" example" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "630430c5-21dc-4897-9a4b-3b01baf3de17", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:45.625764Z", + "iopub.status.busy": "2024-01-17T01:35:45.625341Z", + "iopub.status.idle": "2024-01-17T01:35:45.633487Z", + "shell.execute_reply": "2024-01-17T01:35:45.632477Z", + "shell.execute_reply.started": "2024-01-17T01:35:45.625720Z" + 
} + }, + "outputs": [], + "source": [ + "graph: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(\n", + " textgraphs.SimpleGraph()\n", + ")\n", + "\n", + "ingram_path: pathlib.Path = pathlib.Path(os.getcwd()) / \"ingram.json\"\n", + "\n", + "graph.load_ingram(\n", + " ingram_path,\n", + " debug = False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "01152885-f301-49b1-ab61-f5b19d81c036", + "metadata": {}, + "source": [ + "set up the statistical stack profiling" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2a289117-301d-4027-ae1b-200201fb5f93", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:45.639466Z", + "iopub.status.busy": "2024-01-17T01:35:45.639216Z", + "iopub.status.idle": "2024-01-17T01:35:45.646105Z", + "shell.execute_reply": "2024-01-17T01:35:45.644476Z", + "shell.execute_reply.started": "2024-01-17T01:35:45.639439Z" + } + }, + "outputs": [], + "source": [ + "profiler: Profiler = Profiler()\n", + "profiler.start()" + ] + }, + { + "cell_type": "markdown", + "id": "bf9d4f99-b82b-4d11-a9a4-31d0337f4aa8", + "metadata": {}, + "source": [ + "## decouple graph edges into \"seeds\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "da6fcb0f-b2ac-4f74-af39-2c129c750cab", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:45.648335Z", + "iopub.status.busy": "2024-01-17T01:35:45.647905Z", + "iopub.status.idle": "2024-01-17T01:35:46.520730Z", + "shell.execute_reply": "2024-01-17T01:35:46.518237Z", + "shell.execute_reply.started": "2024-01-17T01:35:45.648291Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- triples in source graph ---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ic| edge.src_node: 0, rel_id: 1, edge.dst_node: 1\n", + "ic| edge.src_node: 0, rel_id: 0, edge.dst_node: 2\n", + "ic| edge.src_node: 0, rel_id: 0, edge.dst_node: 3\n", + 
"ic| edge.src_node: 4, rel_id: 2, edge.dst_node: 2\n", + "ic| edge.src_node: 4, rel_id: 2, edge.dst_node: 3\n", + "ic| edge.src_node: 4, rel_id: 1, edge.dst_node: 5\n", + "ic| edge.src_node: 6, rel_id: 1, edge.dst_node: 5\n", + "ic| edge.src_node: 6, rel_id: 2, edge.dst_node: 7\n", + "ic| edge.src_node: 6, rel_id: 4, edge.dst_node: 8\n", + "ic| edge.src_node: 9, " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Steven_Spielberg Profession Director\n", + " Steven_Spielberg Directed Catch_Me_If_Can\n", + " Steven_Spielberg Directed Saving_Private_Ryan\n", + " Tom_Hanks ActedIn Catch_Me_If_Can\n", + " Tom_Hanks ActedIn Saving_Private_Ryan\n", + " Tom_Hanks Profession Actor\n", + " Mark_Hamil Profession Actor\n", + " Mark_Hamil ActedIn Star_Wars\n", + " Mark_Hamil BornIn California\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "rel_id: 5, edge.dst_node: 10\n", + "ic| edge.src_node: 9, rel_id: 4, edge.dst_node: 10\n", + "ic| edge.src_node: 9, rel_id: 3, edge.dst_node: 8\n", + "ic| edge.src_node: 11, rel_id: 4, edge.dst_node: 12\n", + "ic| edge.src_node: 11, rel_id: 3, edge.dst_node: 12\n", + "ic| edge.src_node: 11, rel_id: 3, edge.dst_node: 8\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Brad_Pitt Nationality USA\n", + " Brad_Pitt BornIn USA\n", + " Brad_Pitt LivedIn California\n", + " Clint_Eastwood BornIn San_Francisco\n", + " Clint_Eastwood LivedIn San_Francisco\n", + " Clint_Eastwood LivedIn California\n" + ] + } + ], + "source": [ + "graph.seeds(\n", + " debug = True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a9c0fd41-45e9-4019-94bf-8e2cf5c33454", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:46.524005Z", + "iopub.status.busy": "2024-01-17T01:35:46.523531Z", + "iopub.status.idle": "2024-01-17T01:35:46.531929Z", + "shell.execute_reply": "2024-01-17T01:35:46.530922Z", + "shell.execute_reply.started": 
"2024-01-17T01:35:46.523965Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- nodes in source graph ---\n", + "n: 0, Steven_Spielberg\n", + " head: []\n", + " tail: [(0, 'Profession', 1), (0, 'Directed', 2), (0, 'Directed', 3)]\n", + "n: 1, Director\n", + " head: [(0, 'Profession', 1)]\n", + " tail: []\n", + "n: 2, Catch_Me_If_Can\n", + " head: [(0, 'Directed', 2), (4, 'ActedIn', 2)]\n", + " tail: []\n", + "n: 3, Saving_Private_Ryan\n", + " head: [(0, 'Directed', 3), (4, 'ActedIn', 3)]\n", + " tail: []\n", + "n: 4, Tom_Hanks\n", + " head: []\n", + " tail: [(4, 'ActedIn', 2), (4, 'ActedIn', 3), (4, 'Profession', 5)]\n", + "n: 5, Actor\n", + " head: [(4, 'Profession', 5), (6, 'Profession', 5)]\n", + " tail: []\n", + "n: 6, Mark_Hamil\n", + " head: []\n", + " tail: [(6, 'Profession', 5), (6, 'ActedIn', 7), (6, 'BornIn', 8)]\n", + "n: 7, Star_Wars\n", + " head: [(6, 'ActedIn', 7)]\n", + " tail: []\n", + "n: 8, California\n", + " head: [(6, 'BornIn', 8), (9, 'LivedIn', 8), (11, 'LivedIn', 8)]\n", + " tail: []\n", + "n: 9, Brad_Pitt\n", + " head: []\n", + " tail: [(9, 'Nationality', 10), (9, 'BornIn', 10), (9, 'LivedIn', 8)]\n", + "n: 10, USA\n", + " head: [(9, 'Nationality', 10), (9, 'BornIn', 10)]\n", + " tail: []\n", + "n: 11, Clint_Eastwood\n", + " head: []\n", + " tail: [(11, 'BornIn', 12), (11, 'LivedIn', 12), (11, 'LivedIn', 8)]\n", + "n: 12, San_Francisco\n", + " head: [(11, 'BornIn', 12), (11, 'LivedIn', 12)]\n", + " tail: []\n", + "\n", + "--- edges in source graph ---\n", + "e: 0, Directed\n", + "e: 1, Profession\n", + "e: 2, ActedIn\n", + "e: 3, LivedIn\n", + "e: 4, BornIn\n", + "e: 5, Nationality\n" + ] + } + ], + "source": [ + "graph.trace_source_graph()" + ] + }, + { + "cell_type": "markdown", + "id": "7e7cb5f3-132c-4999-81eb-4f6167a31c9e", + "metadata": {}, + "source": [ + "## construct a _graph of relations_" + ] + }, + { + "cell_type": "markdown", + "id": 
"105702ed-7f9c-42ca-a57b-f1b15a206acf", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-02T22:31:57.839227Z", + "iopub.status.busy": "2024-01-02T22:31:57.838113Z", + "iopub.status.idle": "2024-01-02T22:31:57.853374Z", + "shell.execute_reply": "2024-01-02T22:31:57.851669Z", + "shell.execute_reply.started": "2024-01-02T22:31:57.839155Z" + } + }, + "source": [ + "Transform the graph data into _graph of relations_" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "eae8da18-f1be-4673-94e7-7b633bab9bd1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:46.534228Z", + "iopub.status.busy": "2024-01-17T01:35:46.533720Z", + "iopub.status.idle": "2024-01-17T01:35:48.718340Z", + "shell.execute_reply": "2024-01-17T01:35:48.715493Z", + "shell.execute_reply.started": "2024-01-17T01:35:46.534166Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ic| node_id: 0, len(seeds" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- transformed triples ---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "): 3\n", + "ic| trans_arc: TransArc(pair_key=(0, 1),\n", + " a_rel=1,\n", + " b_rel=0,\n", + " node_id=0,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| trans_arc: TransArc(pair_key=(0, 1),\n", + " a_rel=1,\n", + " b_rel=0,\n", + " node_id=0,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| trans_arc: TransArc(pair_key=(0, 0),\n", + " a_rel=0,\n", + " b_rel=0,\n", + " node_id=0,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| node_id: 1, len(seeds" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "): 1\n", + "ic| node_id: 2, len(seeds): 2\n", + "ic| trans_arc: TransArc(pair_key=(0, 2),\n", + " a_rel=0,\n", + " b_rel=2,\n", + " node_id=2,\n", + " a_dir=,\n", + " b_dir=<" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + " (0, 2) Directed.head Catch_Me_If_Can ActedIn.head\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RelDir.HEAD: 0>)\n", + "ic| node_id: 3, len(seeds): 2\n", + "ic| trans_arc: TransArc(pair_key=(0, 2),\n", + " a_rel=0,\n", + " b_rel=2,\n", + " node_id=3,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| node_id" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " (0, 2) Directed.head Saving_Private_Ryan ActedIn.head\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ": 4, len(seeds): 3\n", + "ic| trans_arc: TransArc(pair_key=(2, 2),\n", + " a_rel=2,\n", + " b_rel=2,\n", + " node_id=4,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| trans_arc: TransArc(pair_key=(1, 2),\n", + " a_rel=2,\n", + " b_rel=1,\n", + " node_id=4,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| trans_arc: TransArc(pair_key=(1, 2)" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (2, 2) ActedIn.tail Tom_Hanks ActedIn.tail\n", + "\n", + " (1, 2) ActedIn.tail Tom_Hanks Profession.tail\n", + "\n", + " (1, 2) ActedIn.tail Tom_Hanks Profession.tail\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ",\n", + " a_rel=2,\n", + " b_rel=1,\n", + " node_id=4,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic|" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " node_id: 5, len(seeds): 2\n", + "ic| trans_arc: TransArc(pair_key=(1, 1),\n", + " a_rel=1,\n", + " b_rel=1,\n", + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (1, 1) Profession.head Actor Profession.head\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "node_id=5,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| node_id: 6, len(seeds): 3\n", + "ic| trans_arc: TransArc(pair_key=(1, 2),\n", + " a_rel=1,\n", + " b_rel=2,\n", + " 
node_id=6,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| trans_arc: TransArc(pair_key=(1, 4),\n", + " a_rel=1,\n", + " b_rel=4,\n", + " node_id=6,\n", + " a_dir" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " (1, 4) Profession.tail Mark_Hamil BornIn.tail\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "=,\n", + " b_dir=)\n", + "ic| trans_arc: TransArc(pair_key=(2, 4),\n", + " a_rel=2,\n", + " b_rel=4,\n", + " node_id=6,\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " (2, 4) ActedIn.tail Mark_Hamil BornIn.tail\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " a_dir=,\n", + " b_dir=)\n", + "ic| node_id: 7, len(seeds): 1\n", + "ic| node_id: 8, len(seeds): 3\n", + "ic| trans_arc: TransArc(pair_key=(3, 4),\n", + " a_rel=4,\n", + " b_rel=3,\n", + " node_id=8,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| trans_arc: TransArc(pair_key=(3, 4),\n", + " a_rel=4,\n", + " b_rel=3,\n", + " node_id=8,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| trans_arc: TransArc(pair_key=(3, 3),\n", + " a_rel=3,\n", + " b_rel=3,\n", + " node_id=8,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| node_id: 9, len(seeds): 3\n", + "ic" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " (3, 4) BornIn.head California LivedIn.head\n", + "\n", + " (3, 3) LivedIn.head California LivedIn.head\n", + "\n", + " (4, 5) Nationality.tail Brad_Pitt BornIn.tail\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "| trans_arc: TransArc(pair_key=(4, 5),\n", + " a_rel=5,\n", + " b_rel=4,\n", + " node_id=9,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| trans_arc: TransArc(pair_key=(3, 5),\n", + " a_rel=5,\n", + " b_rel=3,\n", + " node_id=9,\n", + " a_dir=,\n", + " b_dir=<" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " (3, 5) Nationality.tail Brad_Pitt LivedIn.tail\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "RelDir.TAIL: 1>)\n", + "ic| trans_arc: TransArc(pair_key=(3, 4),\n", + " a_rel=4,\n", + " b_rel=3,\n", + " node_id=9,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| node_id: 10, len(seeds): 2\n", + "ic| trans_arc: TransArc(pair_key=(4, 5),\n", + " a_rel=5,\n", + " b_rel=4,\n", + " node_id=10,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| node_id: 11, len(seeds): 3\n", + "ic| trans_arc: TransArc(pair_key=(3, " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " (3, 4) BornIn.tail Brad_Pitt LivedIn.tail\n", + "\n", + " (4, 5) Nationality.head USA BornIn.head\n", + "\n", + " (3, 4) BornIn.tail Clint_Eastwood LivedIn.tail\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "4),\n", + " a_rel=4,\n", + " b_rel=3,\n", + " node_id=11,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " (3, 4) BornIn.tail Clint_Eastwood LivedIn.tail\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "| trans_arc: TransArc(pair_key=(3, 4),\n", + " a_rel=4,\n", + " b_rel=3,\n", + " node_id=11,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| trans_arc: TransArc(pair_key=(3, 3),\n", + " a_rel=3,\n", + " b_rel=3,\n", + " node_id=11,\n", + " a_dir=,\n", + " b_dir=)\n", + "ic| node_id: 12, len(seeds" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " (3, 3) LivedIn.tail Clint_Eastwood LivedIn.tail\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "): 2\n", + "ic| trans_arc: TransArc(pair_key=(3, 4),\n", + " a_rel=4,\n", + " b_rel=3,\n", + " node_id=12,\n", + " a_dir=,\n", + " b_dir=)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (3, 4) BornIn.head San_Francisco LivedIn.head\n", + "\n" + ] + } + ], + "source": [ + "graph.construct_gor(\n", + "\tdebug = True,\n", + ")" + ] + }, + { + "cell_type": "code", + 
"execution_count": 11, + "id": "d5a06b72-c19b-440c-83c7-332f28aa9586", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:48.731674Z", + "iopub.status.busy": "2024-01-17T01:35:48.731142Z", + "iopub.status.idle": "2024-01-17T01:35:48.745182Z", + "shell.execute_reply": "2024-01-17T01:35:48.739573Z", + "shell.execute_reply.started": "2024-01-17T01:35:48.731638Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- collect shared entity tallies ---\n", + "0 Directed\n", + " h: 4 dict_items([(2, 4.0)])\n", + " t: 6 dict_items([(0, 3.0), (1, 3.0)])\n", + "1 Profession\n", + " h: 3 dict_items([(1, 3.0)])\n", + " t: 10 dict_items([(0, 3.0), (2, 5.0), (4, 2.0)])\n", + "2 ActedIn\n", + " h: 4 dict_items([(0, 4.0)])\n", + " t: 10 dict_items([(1, 5.0), (2, 3.0), (4, 2.0)])\n", + "3 LivedIn\n", + " h: 8 dict_items([(3, 3.0), (4, 5.0)])\n", + " t: 10 dict_items([(3, 3.0), (4, 5.0), (5, 2.0)])\n", + "4 BornIn\n", + " h: 7 dict_items([(3, 5.0), (5, 2.0)])\n", + " t: 11 dict_items([(1, 2.0), (2, 2.0), (3, 5.0), (5, 2.0)])\n", + "5 Nationality\n", + " h: 2 dict_items([(4, 2.0)])\n", + " t: 4 dict_items([(3, 2.0), (4, 2.0)])\n" + ] + } + ], + "source": [ + "scores: typing.Dict[ tuple, float ] = graph.get_affinity_scores(\n", + " debug = True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a3d2310b-11c1-476d-82ab-1e34bc496cb1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:48.749266Z", + "iopub.status.busy": "2024-01-17T01:35:48.748905Z", + "iopub.status.idle": "2024-01-17T01:35:48.964799Z", + "shell.execute_reply": "2024-01-17T01:35:48.957975Z", + "shell.execute_reply.started": "2024-01-17T01:35:48.749231Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ic| scores: {(0, 0): 0.3,\n", + " (0, 1): 0.2653846153846154,\n", + " (0, 2): 0.34285714285714286,\n", + " (1, 1): 
0.23076923076923078,\n", + " (1, 2): 0.3708791208791209,\n", + " (1, 4): 0.13247863247863248,\n", + " (2, 2): 0.21428571428571427,\n", + " (2, 4): 0.12698412698412698,\n", + " (3, 3): 0.3333333333333333,\n", + " (3, 4): 0.5555555555555556,\n", + " (3, 5): 0.2222222222222222,\n", + " (4, 5): 0.4444444444444444}\n" + ] + } + ], + "source": [ + "ic(scores);" + ] + }, + { + "cell_type": "markdown", + "id": "8b71b841-0cf5-4cc6-af4c-c85344b8f6c5", + "metadata": {}, + "source": [ + "## visualize the transform results" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5901a49e-3f90-4061-9c3a-e9d1f05b40f3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:48.973661Z", + "iopub.status.busy": "2024-01-17T01:35:48.973146Z", + "iopub.status.idle": "2024-01-17T01:35:49.339291Z", + "shell.execute_reply": "2024-01-17T01:35:49.337857Z", + "shell.execute_reply.started": "2024-01-17T01:35:48.973607Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "graph.render_gor_plt(scores)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8fa85274-6d16-48eb-b875-01108a9575b8", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:49.341965Z", + "iopub.status.busy": "2024-01-17T01:35:49.341537Z", + "iopub.status.idle": "2024-01-17T01:35:49.383683Z", + "shell.execute_reply": "2024-01-17T01:35:49.382725Z", + "shell.execute_reply.started": "2024-01-17T01:35:49.341916Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tmp.fig03.html\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pv_graph: pyvis.network.Network = graph.render_gor_pyvis(scores)\n", + "\n", + "pv_graph.force_atlas_2based(\n", + " gravity = -38,\n", + " central_gravity = 0.01,\n", + " spring_length = 231,\n", + " spring_strength = 0.7,\n", + " damping = 0.8,\n", + " overlap = 0,\n", + ")\n", + "\n", + "pv_graph.show_buttons(filter_ = [ \"physics\" ])\n", + "pv_graph.toggle_physics(True)\n", + "\n", + "pv_graph.prep_notebook()\n", + "pv_graph.show(\"tmp.fig03.html\")" + ] + }, + { + "cell_type": "markdown", + "id": "07cf6fca-af95-4cf0-9e3b-247521bafbff", + "metadata": {}, + "source": [ + "## analysis" + ] + }, + { + "cell_type": "markdown", + "id": "97af44dc-4e56-4986-9f54-cbfaff67e3d4", + "metadata": {}, + "source": [ + "As the results below illustrate, the computed _affinity scores_ differ from what is published in `lee2023ingram`. After trying several different variations of interpretation for the paper's descriptions, the current approach provides the closest approximation that we have obtained."
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f64462b4-654a-4e2e-bea2-a36bdc5ec967", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:49.387402Z", + "iopub.status.busy": "2024-01-17T01:35:49.386218Z", + "iopub.status.idle": "2024-01-17T01:35:49.434520Z", + "shell.execute_reply": "2024-01-17T01:35:49.432123Z", + "shell.execute_reply.started": "2024-01-17T01:35:49.387333Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pairrel_arel_baffinityexpected
0(0, 0)DirectedDirected0.30NaN
1(0, 1)DirectedProfession0.270.22
2(0, 2)DirectedActedIn0.340.50
3(1, 1)ProfessionProfession0.23NaN
4(1, 2)ProfessionActedIn0.370.33
5(1, 4)ProfessionBornIn0.130.11
6(2, 2)ActedInActedIn0.21NaN
7(2, 4)ActedInBornIn0.130.11
8(3, 3)LivedInLivedIn0.33NaN
9(3, 4)LivedInBornIn0.560.81
10(3, 5)LivedInNationality0.220.11
11(4, 5)BornInNationality0.440.36
\n", + "
" + ], + "text/plain": [ + " pair rel_a rel_b affinity expected\n", + "0 (0, 0) Directed Directed 0.30 NaN\n", + "1 (0, 1) Directed Profession 0.27 0.22\n", + "2 (0, 2) Directed ActedIn 0.34 0.50\n", + "3 (1, 1) Profession Profession 0.23 NaN\n", + "4 (1, 2) Profession ActedIn 0.37 0.33\n", + "5 (1, 4) Profession BornIn 0.13 0.11\n", + "6 (2, 2) ActedIn ActedIn 0.21 NaN\n", + "7 (2, 4) ActedIn BornIn 0.13 0.11\n", + "8 (3, 3) LivedIn LivedIn 0.33 NaN\n", + "9 (3, 4) LivedIn BornIn 0.56 0.81\n", + "10 (3, 5) LivedIn Nationality 0.22 0.11\n", + "11 (4, 5) BornIn Nationality 0.44 0.36" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df: pd.DataFrame = graph.trace_metrics(scores)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "ff49fe28-e75f-4590-8b87-0d8962928cba", + "metadata": {}, + "source": [ + "## statistical stack profile instrumentation" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "af4ecb06-370f-4077-9899-29a1673e4768", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:49.437344Z", + "iopub.status.busy": "2024-01-17T01:35:49.436840Z", + "iopub.status.idle": "2024-01-17T01:35:49.444892Z", + "shell.execute_reply": "2024-01-17T01:35:49.444135Z", + "shell.execute_reply.started": "2024-01-17T01:35:49.437293Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profiler.stop()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d5ac2ce6-15b1-41ad-8215-8a5f76036cf1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:49.446514Z", + "iopub.status.busy": "2024-01-17T01:35:49.446199Z", + "iopub.status.idle": "2024-01-17T01:35:49.728817Z", + "shell.execute_reply": "2024-01-17T01:35:49.728098Z", + "shell.execute_reply.started": "2024-01-17T01:35:49.446483Z" + } + }, + "outputs": [ + { 
+ "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 17:35:45 Samples: 2526\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 3.799 CPU time: 4.060\n", + "/ _/ v4.6.1\n", + "\n", + "Program: /Users/paco/src/textgraphs/venv/lib/python3.10/site-packages/ipykernel_launcher.py -f /Users/paco/Library/Jupyter/runtime/kernel-27f0c564-73f8-45ab-9f64-8b064ae1de10.json\n", + "\n", + "3.799 IPythonKernel.dispatch_queue ipykernel/kernelbase.py:525\n", + "└─ 3.791 IPythonKernel.process_one ipykernel/kernelbase.py:511\n", + " [10 frames hidden] ipykernel, IPython\n", + " 3.680 ZMQInteractiveShell.run_ast_nodes IPython/core/interactiveshell.py:3394\n", + " ├─ 2.176 ../ipykernel_4421/3358887201.py:1\n", + " │ └─ 2.176 GraphOfRelations.construct_gor textgraphs/gor.py:311\n", + " │ ├─ 1.607 IceCreamDebugger.__call__ icecream/icecream.py:204\n", + " │ │ [17 frames hidden] icecream, colorama, ipykernel, thread...\n", + " │ │ 1.078 lock.acquire \n", + " │ └─ 0.566 GraphOfRelations._transformed_triples textgraphs/gor.py:275\n", + " │ └─ 0.563 IceCreamDebugger.__call__ icecream/icecream.py:204\n", + " │ [13 frames hidden] icecream, colorama, ipykernel, zmq, t...\n", + " ├─ 0.866 ../ipykernel_4421/4061275008.py:1\n", + " │ └─ 0.866 GraphOfRelations.seeds textgraphs/gor.py:197\n", + " │ └─ 0.865 IceCreamDebugger.__call__ icecream/icecream.py:204\n", + " │ [42 frames hidden] icecream, inspect, posixpath, ../ipykernel_4421/559531165.py:1\n", + " │ ├─ 0.234 show matplotlib/pyplot.py:482\n", + " │ │ [32 frames hidden] matplotlib, matplotlib_inline, IPytho...\n", + " │ └─ 0.128 GraphOfRelations.render_gor_plt textgraphs/gor.py:522\n", + " │ └─ 0.104 draw_networkx networkx/drawing/nx_pylab.py:127\n", + " │ [6 frames hidden] networkx, matplotlib\n", + " ├─ 0.197 ../ipykernel_4421/1169542473.py:1\n", + " │ └─ 0.197 IceCreamDebugger.__call__ icecream/icecream.py:204\n", + " │ [14 frames hidden] icecream, colorama, ipykernel, thread...\n", + " └─ 
0.041 ../ipykernel_4421/2247466716.py:1\n", + "\n", + "\n" + ] + } + ], + "source": [ + "profiler.print()" + ] + }, + { + "cell_type": "markdown", + "id": "c47bcfd2-2bd6-49a5-8f1a-102d90edde39", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## outro" + ] + }, + { + "cell_type": "markdown", + "id": "68bea4f9-aec2-4b28-8f08-a4034851d066", + "metadata": {}, + "source": [ + "_\\[ more parts are in progress, getting added to this demo \\]_" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/ex2_0.ipynb b/examples/ex2_0.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..31b5fe7f22c60314f4911b47cafe2c289fd72aec --- /dev/null +++ b/examples/ex2_0.ipynb @@ -0,0 +1,627 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "c32bf0b9-1445-4ede-ae49-7dd63ff3b08e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:52.002602Z", + "iopub.status.busy": "2024-01-17T01:35:52.001643Z", + "iopub.status.idle": "2024-01-17T01:35:52.021332Z", + "shell.execute_reply": "2024-01-17T01:35:52.018806Z", + "shell.execute_reply.started": "2024-01-17T01:35:52.002544Z" + } + }, + "outputs": [], + "source": [ + "# for use in tutorial and development; do not include this `sys.path` change in production:\n", + "import sys ; sys.path.insert(0, \"../\")" + ] + }, + { + "cell_type": "markdown", + "id": "c8ff5d81-110c-42ae-8aa7-ed4fffea40c6", + "metadata": {}, + "source": [ + "# bootstrap the _lemma graph_ with RDF triples" + ] + }, + { + "cell_type": "markdown", + "id": 
"1e847d0a-bc6c-470a-9fef-620ebbdbbbc3", + "metadata": {}, + "source": [ + "Show how to bootstrap definitions in a _lemma graph_ by loading RDF, e.g., for synonyms." + ] + }, + { + "cell_type": "markdown", + "id": "61d8d39a-23e4-48e7-b8f4-0dd724ccf586", + "metadata": {}, + "source": [ + "## environment" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "22489527-2ad5-4e3c-be23-f511e6bcf69f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:52.030355Z", + "iopub.status.busy": "2024-01-17T01:35:52.029702Z", + "iopub.status.idle": "2024-01-17T01:35:59.577245Z", + "shell.execute_reply": "2024-01-17T01:35:59.576046Z", + "shell.execute_reply.started": "2024-01-17T01:35:52.030319Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "from icecream import ic\n", + "from pyinstrument import Profiler\n", + "import pyvis\n", + "\n", + "import textgraphs" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "438f5775-487b-493e-a172-59b652b94955", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:59.579567Z", + "iopub.status.busy": "2024-01-17T01:35:59.579060Z", + "iopub.status.idle": "2024-01-17T01:35:59.603599Z", + "shell.execute_reply": "2024-01-17T01:35:59.602072Z", + "shell.execute_reply.started": "2024-01-17T01:35:59.579536Z" + } + }, + "outputs": [], + "source": [ + "%load_ext watermark" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "adc052dd-5cca-4d11-b543-3f0999f4f883", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:59.605959Z", + "iopub.status.busy": "2024-01-17T01:35:59.605459Z", + "iopub.status.idle": "2024-01-17T01:35:59.655730Z", + "shell.execute_reply": "2024-01-17T01:35:59.654417Z", + "shell.execute_reply.started": "2024-01-17T01:35:59.605924Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last updated: 2024-01-16T17:35:59.608787-08:00\n", + "\n", + "Python implementation: 
CPython\n", + "Python version : 3.10.11\n", + "IPython version : 8.20.0\n", + "\n", + "Compiler : Clang 13.0.0 (clang-1300.0.29.30)\n", + "OS : Darwin\n", + "Release : 21.6.0\n", + "Machine : x86_64\n", + "Processor : i386\n", + "CPU cores : 8\n", + "Architecture: 64bit\n", + "\n" + ] + } + ], + "source": [ + "%watermark" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6e4618da-daf9-44c9-adbb-e5781dba5504", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:59.658604Z", + "iopub.status.busy": "2024-01-17T01:35:59.658083Z", + "iopub.status.idle": "2024-01-17T01:35:59.692941Z", + "shell.execute_reply": "2024-01-17T01:35:59.684789Z", + "shell.execute_reply.started": "2024-01-17T01:35:59.658572Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pyvis : 0.3.2\n", + "textgraphs: 0.5.0\n", + "sys : 3.10.11 (v3.10.11:7d4cc5aa85, Apr 4 2023, 19:05:19) [Clang 13.0.0 (clang-1300.0.29.30)]\n", + "\n" + ] + } + ], + "source": [ + "%watermark --iversions" + ] + }, + { + "cell_type": "markdown", + "id": "23cefb5b-6ee7-4c33-8f82-a526cb9125d8", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-15T00:46:26.663615Z", + "iopub.status.busy": "2024-01-15T00:46:26.662220Z", + "iopub.status.idle": "2024-01-15T00:46:26.673766Z", + "shell.execute_reply": "2024-01-15T00:46:26.672702Z", + "shell.execute_reply.started": "2024-01-15T00:46:26.663477Z" + } + }, + "source": [ + "## load the bootstrap definitions" + ] + }, + { + "cell_type": "markdown", + "id": "89da700d-1e7f-4b24-901f-a36db8525add", + "metadata": {}, + "source": [ + "Define the bootstrap RDF triples in N3/Turtle format: we define an entity `Werner` as a synonym for `Werner Herzog` by using the [`skos:broader`](https://www.w3.org/TR/skos-reference/#semantic-relations) relation. Keep in mind that this entity may also refer to other Werners..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e2412f6c-2c60-40d7-95f5-7bd281d522e7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:59.695180Z", + "iopub.status.busy": "2024-01-17T01:35:59.694887Z", + "iopub.status.idle": "2024-01-17T01:35:59.711557Z", + "shell.execute_reply": "2024-01-17T01:35:59.704654Z", + "shell.execute_reply.started": "2024-01-17T01:35:59.695127Z" + } + }, + "outputs": [], + "source": [ + "TTL_STR: str = \"\"\"\n", + "@base .\n", + "@prefix dbo: .\n", + "@prefix skos: .\n", + "\n", + " a dbo:Person ;\n", + " skos:prefLabel \"Werner\"@en .\n", + "\n", + " a dbo:Person ;\n", + " skos:prefLabel \"Werner Herzog\"@en.\n", + "\n", + "dbo:Person skos:definition \"People, including fictional\"@en ;\n", + " skos:prefLabel \"person\"@en .\n", + "\n", + " skos:broader .\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "7c567afd-2f44-4391-899a-da6aba3d222e", + "metadata": {}, + "source": [ + "Provide the source text" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "630430c5-21dc-4897-9a4b-3b01baf3de17", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:59.718153Z", + "iopub.status.busy": "2024-01-17T01:35:59.717788Z", + "iopub.status.idle": "2024-01-17T01:35:59.734747Z", + "shell.execute_reply": "2024-01-17T01:35:59.732341Z", + "shell.execute_reply.started": "2024-01-17T01:35:59.718117Z" + } + }, + "outputs": [], + "source": [ + "SRC_TEXT: str = \"\"\" \n", + "Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.\n", + "After the war, Werner fled to America to become famous.\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "01152885-f301-49b1-ab61-f5b19d81c036", + "metadata": {}, + "source": [ + "set up the statistical stack profiling" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2a289117-301d-4027-ae1b-200201fb5f93", + "metadata": { + "execution": { 
+ "iopub.execute_input": "2024-01-17T01:35:59.738759Z", + "iopub.status.busy": "2024-01-17T01:35:59.737750Z", + "iopub.status.idle": "2024-01-17T01:35:59.745742Z", + "shell.execute_reply": "2024-01-17T01:35:59.744107Z", + "shell.execute_reply.started": "2024-01-17T01:35:59.738713Z" + } + }, + "outputs": [], + "source": [ + "profiler: Profiler = Profiler()\n", + "profiler.start()" + ] + }, + { + "cell_type": "markdown", + "id": "bf9d4f99-b82b-4d11-a9a4-31d0337f4aa8", + "metadata": {}, + "source": [ + "set up the `TextGraphs` pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "da6fcb0f-b2ac-4f74-af39-2c129c750cab", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:35:59.749862Z", + "iopub.status.busy": "2024-01-17T01:35:59.749122Z", + "iopub.status.idle": "2024-01-17T01:36:03.042323Z", + "shell.execute_reply": "2024-01-17T01:36:03.040676Z", + "shell.execute_reply.started": "2024-01-17T01:35:59.749790Z" + } + }, + "outputs": [], + "source": [ + "tg: textgraphs.TextGraphs = textgraphs.TextGraphs(\n", + " factory = textgraphs.PipelineFactory(\n", + " kg = textgraphs.KGWikiMedia(\n", + " spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API,\n", + " dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API,\n", + " dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API,\n", + " \t\twikidata_api = textgraphs.WIKIDATA_API,\n", + " min_alias = textgraphs.DBPEDIA_MIN_ALIAS,\n", + " min_similarity = textgraphs.DBPEDIA_MIN_SIM,\n", + " ),\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e6f98bbc-6954-4e39-b5d6-f726816bd5c7", + "metadata": {}, + "source": [ + "load the bootstrap definitions" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "321a9a90-ae80-47d7-b392-020b06bd3066", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:36:03.044027Z", + "iopub.status.busy": "2024-01-17T01:36:03.043746Z", + "iopub.status.idle": "2024-01-17T01:36:03.071058Z", + "shell.execute_reply": 
"2024-01-17T01:36:03.070258Z", + "shell.execute_reply.started": "2024-01-17T01:36:03.043990Z" + } + }, + "outputs": [], + "source": [ + "tg.load_bootstrap_ttl(\n", + " TTL_STR,\n", + " debug = False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1db1fe56-52fe-4a01-9776-82908444dd6c", + "metadata": {}, + "source": [ + "parse the input text" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f7f6665e-19da-4a25-a405-adbb5dfb3e88", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:36:03.072882Z", + "iopub.status.busy": "2024-01-17T01:36:03.072607Z", + "iopub.status.idle": "2024-01-17T01:36:03.751536Z", + "shell.execute_reply": "2024-01-17T01:36:03.750042Z", + "shell.execute_reply.started": "2024-01-17T01:36:03.072843Z" + } + }, + "outputs": [], + "source": [ + "pipe: textgraphs.Pipeline = tg.create_pipeline(\n", + " SRC_TEXT.strip(),\n", + ")\n", + "\n", + "tg.collect_graph_elements(\n", + " pipe,\n", + " debug = False,\n", + ")\n", + "\n", + "tg.construct_lemma_graph(\n", + " debug = False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3143955c-446a-4e6c-834c-583ab173f446", + "metadata": {}, + "source": [ + "## visualize the lemma graph" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "05b409af-14df-4158-9709-ffe2d79e864b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:36:03.762865Z", + "iopub.status.busy": "2024-01-17T01:36:03.762378Z", + "iopub.status.idle": "2024-01-17T01:36:03.773217Z", + "shell.execute_reply": "2024-01-17T01:36:03.769536Z", + "shell.execute_reply.started": "2024-01-17T01:36:03.762817Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "render: textgraphs.RenderPyVis = tg.create_render()\n", + "\n", + "pv_graph: pyvis.network.Network = render.render_lemma_graph(\n", + " debug = False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7b5d3e88-6669-4df1-a20a-587cc6a7db12", + "metadata": {}, + "source": [ + 
"initialize the layout parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b212f5ed-03d6-439f-92ae-f2cbedb18609", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:36:03.776399Z", + "iopub.status.busy": "2024-01-17T01:36:03.775428Z", + "iopub.status.idle": "2024-01-17T01:36:03.784525Z", + "shell.execute_reply": "2024-01-17T01:36:03.783464Z", + "shell.execute_reply.started": "2024-01-17T01:36:03.776310Z" + } + }, + "outputs": [], + "source": [ + "pv_graph.force_atlas_2based(\n", + " gravity = -38,\n", + " central_gravity = 0.01,\n", + " spring_length = 231,\n", + " spring_strength = 0.7,\n", + " damping = 0.8,\n", + " overlap = 0,\n", + ")\n", + "\n", + "pv_graph.show_buttons(filter_ = [ \"physics\" ])\n", + "pv_graph.toggle_physics(True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2f952a7c-3130-49c9-b659-fb941e9e0bfe", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:36:03.788862Z", + "iopub.status.busy": "2024-01-17T01:36:03.787641Z", + "iopub.status.idle": "2024-01-17T01:36:03.848366Z", + "shell.execute_reply": "2024-01-17T01:36:03.847499Z", + "shell.execute_reply.started": "2024-01-17T01:36:03.788773Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tmp.fig04.html\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pv_graph.prep_notebook()\n", + "pv_graph.show(\"tmp.fig04.html\")" + ] + }, + { + "cell_type": "markdown", + "id": "e57d42a8-4414-4f27-9817-b9339e65346f", + "metadata": {}, + "source": [ + "Notice how the `Werner` and `Werner Herzog` nodes are now linked? This synonym from the bootstrap definitions above provided means to link more portions of the _lemma graph_ than the demo in `ex0_0` with the same input text." 
+ ] + }, + { + "cell_type": "markdown", + "id": "ff49fe28-e75f-4590-8b87-0d8962928cba", + "metadata": {}, + "source": [ + "## statistical stack profile instrumentation" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "af4ecb06-370f-4077-9899-29a1673e4768", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:36:03.849937Z", + "iopub.status.busy": "2024-01-17T01:36:03.849635Z", + "iopub.status.idle": "2024-01-17T01:36:03.856645Z", + "shell.execute_reply": "2024-01-17T01:36:03.855799Z", + "shell.execute_reply.started": "2024-01-17T01:36:03.849877Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profiler.stop()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d5ac2ce6-15b1-41ad-8215-8a5f76036cf1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-17T01:36:03.857987Z", + "iopub.status.busy": "2024-01-17T01:36:03.857704Z", + "iopub.status.idle": "2024-01-17T01:36:04.615855Z", + "shell.execute_reply": "2024-01-17T01:36:04.615084Z", + "shell.execute_reply.started": "2024-01-17T01:36:03.857962Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " _ ._ __/__ _ _ _ _ _/_ Recorded: 17:35:59 Samples: 2846\n", + " /_//_/// /_\\ / //_// / //_'/ // Duration: 4.111 CPU time: 3.294\n", + "/ _/ v4.6.1\n", + "\n", + "Program: /Users/paco/src/textgraphs/venv/lib/python3.10/site-packages/ipykernel_launcher.py -f /Users/paco/Library/Jupyter/runtime/kernel-4365d4ba-2d4d-4d4b-83e2-eb5ef8abfe26.json\n", + "\n", + "4.111 IPythonKernel.dispatch_shell ipykernel/kernelbase.py:378\n", + "└─ 4.075 IPythonKernel.execute_request ipykernel/kernelbase.py:721\n", + " [9 frames hidden] ipykernel, IPython\n", + " 3.995 ZMQInteractiveShell.run_ast_nodes IPython/core/interactiveshell.py:3394\n", + " ├─ 3.250 ../ipykernel_4433/1372904243.py:1\n", + " │ └─ 
3.248 PipelineFactory.__init__ textgraphs/pipe.py:434\n", + " │ └─ 3.232 load spacy/__init__.py:27\n", + " │ [98 frames hidden] spacy, en_core_web_sm, catalogue, imp...\n", + " │ 0.496 tokenizer_factory spacy/language.py:110\n", + " │ └─ 0.108 _validate_special_case spacy/tokenizer.pyx:573\n", + " │ 0.439 spacy/language.py:2170\n", + " │ └─ 0.085 _validate_special_case spacy/tokenizer.pyx:573\n", + " ├─ 0.672 ../ipykernel_4433/3257668275.py:1\n", + " │ └─ 0.669 TextGraphs.create_pipeline textgraphs/doc.py:103\n", + " │ └─ 0.669 PipelineFactory.create_pipeline textgraphs/pipe.py:508\n", + " │ └─ 0.669 Pipeline.__init__ textgraphs/pipe.py:216\n", + " │ └─ 0.669 English.__call__ spacy/language.py:1016\n", + " │ [31 frames hidden] spacy, spacy_dbpedia_spotlight, reque...\n", + " └─ 0.055 ../ipykernel_4433/72966960.py:1\n", + " └─ 0.046 Network.prep_notebook pyvis/network.py:552\n", + " [5 frames hidden] pyvis, jinja2\n", + "\n", + "\n" + ] + } + ], + "source": [ + "profiler.print()" + ] + }, + { + "cell_type": "markdown", + "id": "c47bcfd2-2bd6-49a5-8f1a-102d90edde39", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## outro" + ] + }, + { + "cell_type": "markdown", + "id": "68bea4f9-aec2-4b28-8f08-a4034851d066", + "metadata": {}, + "source": [ + "_\\[ more parts are in progress, getting added to this demo \\]_" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/fish.py b/examples/fish.py new file mode 100644 index 0000000000000000000000000000000000000000..00fddd29ef58d646e76afd2e118dabe2ad255b47 --- /dev/null +++ b/examples/fish.py @@ 
# Sample passage which gets run through the entity linking pipeline below.
SRC_TEXT: str = """
Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog, although they never spoke after the war.
"""

# Load the small English model with its stock `ner` component excluded;
# presumably the `span_marker` component added next takes over entity
# recognition -- TODO confirm.
nlp = spacy.load(
    "en_core_web_sm",
    exclude = [ "ner" ],
)

# NB: the order of the `add_pipe()` calls is the order in which the
# components get appended to the pipeline, so keep these calls in sequence.
nlp.add_pipe(
    "span_marker",
    config = {
        "model": "tomaarsen/span-marker-roberta-large-ontonotes5",
    },
)

# Entity linking through the remote `entityfishing` service at the
# `api_ef_base` URL, so running this script requires network access.
nlp.add_pipe(
    "entityfishing",
    config = {
        "api_ef_base": "https://cloud.science-miner.com/nerd/service",
        "extra_info": True,
        "filter_statements": [ ],
    },
)

nlp.add_pipe(
    "merge_entities",
)


# Parse the sample text, minus its leading/trailing newlines.
doc = nlp(SRC_TEXT.strip())

# Trace each entity span along with its custom `._.` extension attributes;
# presumably these attributes are populated by the `entityfishing`
# component -- verify against that package's documentation.
for ent in doc.ents:
    ic(
        ent.text,
        ent.label_,
        ent._.nerd_score,
        ent._.url_wikidata,
        ent._.description,
        ent._.other_ids,
    )
# Build a `text-generation` pipeline for the `argilla/notus-7b-v1` model,
# requesting `bfloat16` weights and `device_map = "auto"`.
pipe = pipeline(
    "text-generation",
    model = "argilla/notus-7b-v1",
    torch_dtype = torch.bfloat16,
    device_map = "auto",
)

# Chat-style input: one system message followed by one user message.
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant super biased towards Argilla, a data annotation company.", # pylint: disable=C0301
    },
    {
        "role": "user",
        "content": "What's the best data annotation company out there in your opinion?", # pylint: disable=C0301
    },
]

# Render the messages through the tokenizer's chat template; with
# `tokenize=False` this presumably yields the prompt as a plain string
# rather than token ids -- TODO confirm against the `transformers` docs.
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Sampled generation, capped at 256 new tokens.
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)

# Print the generated text of the first result.
generated_text = outputs[0]["generated_text"]
print(generated_text)
def link_wikidata (
    doc: spacy.tokens.doc.Doc,
    ) -> None:
    """
Trace the Wikidata entity linking results for each entity span in a
parsed document.

    doc:
parsed `spaCy` document whose `ents` get linked
    """
    classifier = sel.EntityClassifier.EntityClassifier()

    for ent in doc.ents:
        print()
        ic(ent.text, ent.label_)

        # wrap the span as a term, then look up every candidate entity
        # for that term
        term = sel.TermCandidate.TermCandidate(ent)
        candidates = term.get_entity_candidates()
        ic(candidates)

        # nothing to classify for this span
        if len(candidates) == 0:
            continue

        # let the classifier select the best candidate, then trace its
        # attributes plus its sub- and super-entities
        entity = classifier(candidates)

        ic(entity.__dict__)
        ic(entity.get_sub_entities(limit=10))
        ic(entity.get_super_entities(limit=10))
if __name__ == "__main__":
    # wrap an empty `SimpleGraph` in a `GraphOfRelations`, then populate
    # it from the node-link example file
    source_graph = textgraphs.SimpleGraph()
    gor = textgraphs.GraphOfRelations(source_graph)

    ingram_path = pathlib.Path("examples/ingram.json")
    gor.load_ingram(ingram_path, debug = False)

    # the remaining steps run with their debug tracing switched on
    gor.seeds(debug = True)
    gor.trace_source_graph()
    gor.construct_gor(debug = True)

    affinity = gor.get_affinity_scores(debug = True)

    # report the metrics, then plot the graph of relations
    df = gor.trace_metrics(affinity)
    ic(df)

    gor.render_gor_plt(affinity)
    plt.show()
+ +repo_url: https://github.com/DerwenAI/textgraphs +repo_name: DerwenAI/textgraphs + +copyright: Source code and documentation are licensed under an MIT License; Copyright © 2023-2024 Derwen, Inc. + +nav: + - Home: + - Overview: index.md + - Getting Started: start.md + + - Project Report (DRAFT): + - Introduction: + - Abstract: abstract.md + - Project Objectives: objectives.md + - Natural Language Processing: nlp.md + - Graph Representation: graph.md + - Related Work: related.md + - Definitions: + - Lemma Graph: lemma.md + - Probabilistic Graph Features: prob.md + - Graph Levels of Detail: glod.md + - Topological Transforms: topo.md + - Methods: + - Technical Approach: methods.md + - Implementation Details: details.md + - Leveraging Domain Expertise: hitl.md + - Data-First Strategy: strategy.md + - Conclusions: conclude.md + - Research Guides: + - Acknowledgements: ack.md + - Bibliography: biblio.md + - Glossary: glossary.md + - Appendix: + - ML OSS Evaluation Rubric: rubric.md + + - Tutorial: + - Syllabus: tutorial.md + - Example Usage: ex0_0.md + - Replicating "InGram": ex1_0.md + - Using Bootstrap Definitions: ex2_0.md + + - Technical Reference: + - Build Instructions: build.md + - Package Reference: ref.md + +theme: + name: material + icon: + repo: fontawesome/brands/github + favicon: assets/favicon.png + logo: assets/logo.png + features: + - navigation.instant + +plugins: + - mknotebooks + - git-revision-date + +extra_css: + - stylesheets/extra.css + +extra_javascript: + - javascripts/config.js + - https://polyfill.io/v3/polyfill.min.js?features=es6 + - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js + +use_directory_urls: true + +markdown_extensions: + - admonition + - codehilite + - footnotes + - pymdownx.arithmatex: + generic: true + - toc: + toc_depth: 3 + permalink: true diff --git a/pkg_doc.cfg b/pkg_doc.cfg new file mode 100644 index 0000000000000000000000000000000000000000..494b82195432cd0603fe34926f282e1aa832fb3e --- /dev/null +++ 
if __name__ == "__main__":
    # the output path is the one required argument; report its absence
    # as a usage error instead of an `IndexError` traceback
    if len(sys.argv) < 2:
        sys.exit("usage: {} <ref_md_file>".format(sys.argv[0]))

    ref_md_file: str = sys.argv[1]

    # NB: `inspect` is picky about paths and current working directory
    # this only works if run from the top-level directory of the repo
    sys.path.insert(0, "../")

    # the JSON config names the module to document, the base URL for
    # its source links, and the classes to include
    with open("pkg_doc.cfg", "r", encoding="utf-8") as fp:
        config: dict = json.load(fp)

    # `PackageDoc` looks the module up in `sys.modules`, so it has to be
    # imported before the object gets constructed
    importlib.import_module(config["module"])

    pkg_doc: pyfixdoc.PackageDoc = pyfixdoc.PackageDoc(
        config["module"],
        config["src_url"],
        config["classes"],
    )

    # NB: uncomment to analyze/troubleshoot the results of `inspect`
    #pkg_doc.show_all_elements(); sys.exit(0)

    # build the apidocs markdown
    pkg_doc.build()

    # output the apidocs markdown
    pkg_doc.write_markdown(ref_md_file)
-*- coding: utf-8 -*- +# pylint: disable=C0103,C0114,C0116,C0209,C0301,E0401,R0914,W0611,W0613,W0621,W0702,W1308,W1514 + +""" +Implementation of apidoc-ish documentation which generates actual +Markdown that can be used with MkDocs, and fits with Diátaxis design +principles for effective documentation. Because the others really +don't. + +In particular, this library... + + * is aware of type annotations (PEP 484, etc.) + * fixes Py version bugs related to `typing` and `inspect` + * handles forward references (prior to Python 3.8) + * links to source lines in a GitHub repo + * provides non-bassackwards parameter descriptions (eyes on *you*, GOOG) + * does not require use of a plugin + * uses `icecream` for debugging + * exists b/c Sphinx really sucks + +You're welcome. +""" + +import inspect +import os +import re +import sys +import traceback +import typing + +from icecream import ic # type: ignore # pylint: disable=E0401 + + +class PackageDoc: + """ +Because there doesn't appear to be any other Markdown-friendly +docstring support in Python. + +See also: + + * [PEP 256](https://www.python.org/dev/peps/pep-0256/) + * [`inspect`](https://docs.python.org/3/library/inspect.html) + """ + + PAT_PARAM = re.compile(r"( \S+.*\:\n(?:\S.*\n)+)", re.MULTILINE) + PAT_NAME = re.compile(r"^\s+(.*)\:\n(.*)") + PAT_FWD_REF = re.compile(r"ForwardRef\('(.*)'\)") + + + def __init__ ( + self, + module_name: str, + git_url: str, + class_list: typing.List[str], + ) -> None: + """ +Constructor, to configure a `PackageDoc` object. 
def write_markdown (
    self,
    path: str,
    ) -> None:
    """
Output the apidocs markdown to the given path, one entry of `self.md`
per line, encoded as UTF-8.

    path:
path for the output file
    """
    ic("writing", path)

    # be explicit about the encoding: docstrings may contain non-ASCII
    # text, and the platform default encoding varies by locale
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in self.md)
def get_docstring (
    self,
    obj,
    parse=False,
    arg_dict: typing.Optional[dict] = None,
    ) -> typing.List[str]:
    """
Get the docstring for the given object.

    obj:
object whose `__doc__` attribute will be inspected and optionally parsed

    parse:
flag to parse docstring or use the raw text; defaults to `False`

    arg_dict:
optional dictionary passed through to `parse_method_docstring()` when parsing; defaults to `None`, which is treated as an empty dictionary

    returns:
list of lines of markdown; empty if the object has no docstring
    """
    local_md: typing.List[str] = []
    raw_docstring = obj.__doc__

    if raw_docstring:
        docstring = inspect.cleandoc(raw_docstring)

        if parse:
            # build a fresh dictionary per call, rather than sharing one
            # mutable default among all calls
            if arg_dict is None:
                arg_dict = {}

            local_md.append(self.parse_method_docstring(docstring, arg_dict))
        else:
            local_md.append(docstring)

        local_md.append("\n")

    return local_md
def fix_fwd_refs (
    self,
    anno: str,
    ) -> typing.Optional[str]:
    """
Substitute the quoted forward references for a given module class,
replacing each `ForwardRef('Name')` with the bare `Name`.

    anno:
raw annotated type for the forward reference

    returns:
fixed forward reference, as markdown; or `None` if no annotation is supplied
    """
    if not anno:
        return None

    results: list = []

    # split on the separator first: the capture group in the pattern is
    # greedy, so matching against the whole string would swallow all of
    # the text between the first and the last reference
    for term in anno.split(", "):
        # substitute in place, so that any text surrounding a reference
        # (e.g., the brackets of an enclosing generic) stays attached
        fixed = self.PAT_FWD_REF.sub(r"\1", term)

        if len(fixed) > 0:
            results.append(fixed)

    return ", ".join(results)
def get_arg_list (
    self,
    sig: inspect.Signature,
    ) -> list:
    """
Collect the arguments of a method signature, formatted for display.

    sig:
inspect signature for the method

    returns:
argument list of `(arg_name, type_annotation)` pairs, where the name carries any `*` or `**` prefix and any `=default` suffix
    """
    pairs: list = []

    for param in sig.parameters.values():
        # the `self` parameter is left out of the argument list
        if param.name == "self":
            continue

        if param.kind == inspect.Parameter.VAR_POSITIONAL:
            label = "*" + param.name
        elif param.kind == inspect.Parameter.VAR_KEYWORD:
            label = "**" + param.name
        elif param.default == inspect.Parameter.empty:
            label = param.name
        elif isinstance(param.default, str):
            # show string defaults with double quotes
            quoted = repr(param.default).replace("'", '"')
            label = "{}={}".format(param.name, quoted)
        else:
            label = "{}={}".format(param.name, param.default)

        pairs.append((label, self.extract_type_annotation(param.annotation)))

    return pairs
@classmethod
def find_line_num (
    cls,
    src: typing.Tuple[typing.List[str], int],
    member_name: str,
    ) -> int:
    """
Locate the `def` line for a class member within the source lines of its
class. This corrects for the error in parsing source line numbers of
class methods that have decorators.

    src:
the source of the class being inspected, as the pair returned by `inspect.getsourcelines()`; only its list of lines gets used

    member_name:
name of the class member to locate

    returns:
zero-based index, within the source lines, of the last `def` for the member; or `-1` if none is found
    """
    correct_line_num = -1

    for line_num, line in enumerate(src[0]):
        stripped = line.strip()

        # a coroutine is still a `def` once its prefix is set aside
        if stripped.startswith("async "):
            stripped = stripped[len("async "):].lstrip()

        if not stripped.startswith("def "):
            continue

        # the name ends at the opening parenthesis, whether or not a
        # space precedes it: both `def foo (` and `def foo(` match
        def_name = stripped[len("def "):].split("(", 1)[0].strip()

        if def_name == member_name:
            # no early exit: when a name is defined more than once
            # the last definition wins
            correct_line_num = line_num

    return correct_line_num
+ + todo_list: +list of classes to be documented + + class_name: +name of the class to document + """ + self.md.append("## [`{}` class](#{})".format(class_name, class_name)) # pylint: disable=W1308 + + class_obj = todo_list[class_name] + docstring = class_obj.__doc__ + src = inspect.getsourcelines(class_obj) + + if docstring: + # add the raw docstring for a class + self.md.append(docstring) + + obj_md_pos: typing.Dict[int, typing.List[str]] = {} + + for member_name, member_obj in inspect.getmembers(class_obj): + path_list = [self.module_name, class_name] + + if member_name.startswith("__") or not member_name.startswith("_"): + if member_name not in class_obj.__dict__: + # inherited method + continue + + if inspect.isfunction(member_obj): + func_kind = "method" + elif inspect.ismethod(member_obj): + func_kind = "classmethod" + else: + continue + + _, obj_md = self.document_method(path_list, member_name, member_obj, func_kind) + line_num = self.find_line_num(src, member_name) + obj_md_pos[line_num] = obj_md + + for _, obj_md in sorted(obj_md_pos.items()): + self.md.extend(obj_md) + + + def format_functions ( + self + ) -> None: + """ +Walk the module tree, and for each function definition format its +apidocs as markdown. + """ + self.md.append("---") + self.md.append("## [module functions](#{})".format(self.module_name)) + + for func_name, func_obj in inspect.getmembers(self.module_obj, inspect.isfunction): + if not func_name.startswith("_"): + _, obj_md = self.document_method([self.module_name], func_name, func_obj, "function") + self.md.extend(obj_md) + + + def format_types ( + self + ) -> None: + """ +Walk the module tree, and for each type definition format its apidocs +as markdown. 
+ """ + self.md.append("---") + self.md.append("## [module types](#{})".format(self.module_name)) + + for name, obj in inspect.getmembers(self.module_obj): + if obj.__class__.__module__ == "typing": + if not str(obj).startswith("~"): + obj_md = self.document_type([self.module_name], name, obj) + self.md.extend(obj_md) + + +###################################################################### +## test entry point + +if __name__ == "__main__": + pkg_doc = PackageDoc( + "foo", + "http://example.com/", + [], + ) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..fe950c836328b632e2f582e7894bb35ab7d81892 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,114 @@ +[build-system] + +build-backend = "setuptools.build_meta" + +requires = [ + "setuptools >= 69.0", + "setuptools_scm[toml] >= 6.2", + "wheel >= 0.42", +] + + +[tool.setuptools] + +packages = [ "textgraphs" ] + + +[tool.setuptools_scm] + +# required section; empty contents is fine + + +[project.urls] + +home = "https://huggingface.co/spaces/DerwenAI/textgraphs" +docs = "https://derwen.ai/docs/txg/" +code = "https://github.com/DerwenAI/textgraphs" +PyPi = "https://pypi.org/project/textgraphs/" +DOI = "https://zenodo.org/doi/10.5281/zenodo.10431783" + + +[project] + +name = "textgraphs" +dynamic = ["version"] + +authors = [ + { name = "derwen.ai", email = "info@derwen.ai" }, +] + +description = "TextGraphs + LLMs + graph ML for entity extraction, linking, ranking, and constructing a lemma graph" +readme = "README.md" +license = { file = "LICENSE" } + +requires-python = ">=3.10" + +classifiers = [ + "Development Status :: 3 - Alpha", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: 
Scientific/Engineering :: Human Machine Interfaces", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Scientific/Engineering :: Visualization", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Text Processing :: Indexing", + "Topic :: Text Processing :: Linguistic", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", +] + +dependencies = [ + "beautifulsoup4 >= 4.12", + "GitPython >= 3.1", + "icecream >= 2.1", + "markdown2 >= 2.4", + "matplotlib >= 3.8", + "networkx >= 3.2", + "open-nre >= 0.1.1", + "pulp >= 2.7", + "pyinstrument >= 4.6", + "pyvis >= 0.3", + "qwikidata >= 0.4", + "rdflib >= 7.0", + "spacy >= 3.7", + "spacy-dbpedia-spotlight >= 0.2.6", + "span_marker >= 1.5", + "transformers >= 4.35", + "wordcloud >= 1.9", +] + + +[project.optional-dependencies] + +dev = [ + "build >= 1.0", + "Flask >= 3.0", + "mkdocs-git-revision-date-plugin >= 0.3", + "mkdocs-material >= 9.5", + "mknotebooks >= 0.8", + "pre-commit >= 3.5", + "selenium >= 4.16", + "twine >= 4.0", +] + +test = [ + "pytest >= 7.4", + "deepdiff >= 6.7", +] + +demo = [ + "ipywidgets >= 8.1", + "jupyterlab_execute_time >= 3.1", + "jupyterlab >= 4.0", + "kuzu >= 0.1", + "sense2vec >= 2.0", + "spacy-entity-linker >= 1.0", + "streamlit < 1.29", + "watermark >= 2.4", +] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c7d1a6be500780dcf3e4631ec323e99affb5606 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,18 @@ +build >= 1.0 +deepdiff >= 6.7 +Flask >= 3.0 +ipywidgets >= 8.1 +mkdocs-git-revision-date-plugin >= 0.3 +mkdocs-material >= 9.5 +mknotebooks >= 0.8 +jupyterlab >= 4.0 +jupyterlab_execute_time >= 3.1 +kuzu >= 0.1 +pre-commit >= 3.5 +pytest >= 7.4 +selenium >= 4.16 +sense2vec >= 2.0 +spacy-entity-linker >= 1.0 +streamlit < 1.29 +twine >= 4.0 +watermark >= 2.4 diff --git a/requirements.txt 
def test_extract_herzog (
    ) -> None:
    """
    Run the full extraction sequence on the Werner Herzog blurb, then
    check that the expected named entities appear among the top-ranked
    phrases.
    """
    blurb: str = """
Werner Herzog is a remarkable filmmaker and intellectual originally from Germany, the son of Dietrich Herzog.
    """.strip()

    graph: textgraphs.TextGraphs = textgraphs.TextGraphs(  # pylint: disable=C0103
        factory = textgraphs.PipelineFactory(),
    )

    pipe: textgraphs.Pipeline = graph.create_pipeline(blurb)

    # each stage depends on the state left behind by the previous one
    graph.collect_graph_elements(pipe, debug = False)
    graph.perform_entity_linking(pipe, debug = False)
    graph.construct_lemma_graph(debug = False)
    graph.calc_phrase_ranks(debug = False)

    # keep only the four highest-ranked phrases, as `(text, pos)` pairs
    top_k: list = [
        ( row["text"], row["pos"], )
        for _, row in graph.get_phrases_as_df().iterrows()
    ][:4]

    expected: tuple = (
        ("Germany", "PROPN"),
        ("Werner Herzog", "PROPN"),
        ("Dietrich Herzog", "PROPN"),
    )

    for pair in expected:
        assert pair in top_k
+ """ + + tg: textgraphs.TextGraphs = textgraphs.TextGraphs() # pylint: disable=C0103 + pipe: textgraphs.Pipeline = tg.create_pipeline(text.strip()) + + # serialize into node-link format + tg.collect_graph_elements(pipe) + tg.construct_lemma_graph() + tg.calc_phrase_ranks() + + json_str: str = tg.dump_lemma_graph() + exp_graph = json.loads(json_str) + + # deserialize from node-link format + tg = textgraphs.TextGraphs() # pylint: disable=C0103 + tg.load_lemma_graph(json_str) + tg.construct_lemma_graph() + + obs_graph: dict = json.loads(tg.dump_lemma_graph()) + + if debug: + print(obs_graph) + + # compare + diff: deepdiff.diff.DeepDiff = deepdiff.DeepDiff(exp_graph, obs_graph) + + if debug: + print(diff) + + if len(diff) > 0: + print(json.dumps(json.loads(diff.to_json()), indent = 2)) + + assert len(diff) == 0 + + +if __name__ == "__main__": + test_load_minimal(debug = True) diff --git a/textgraphs/__init__.py b/textgraphs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0423dc09bc224b4b19b87029bce261c8ce2a1d43 --- /dev/null +++ b/textgraphs/__init__.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Package definitions for the `TextGraphs` library. 
+ +see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md +""" + +from .defaults import DBPEDIA_MIN_ALIAS, DBPEDIA_MIN_SIM, \ + DBPEDIA_SEARCH_API, DBPEDIA_SPARQL_API, DBPEDIA_SPOTLIGHT_API, \ + FISHING_API, MAX_SKIP, MREBEL_MODEL, \ + NER_MODEL, OPENNRE_MIN_PROB, OPENNRE_MODEL, \ + PAGERANK_ALPHA, SPACY_MODEL, WIKIDATA_API + +from .doc import TextGraphs + +from .elem import Edge, KGSearchHit, LinkedEntity, Node, NodeEnum, NounChunk, RelEnum + +from .gor import Affinity, GraphOfRelations, RelDir, SheafSeed, TransArc + +from .graph import SimpleGraph + +from .kg import KGWikiMedia + +from .ner import NERSpanMarker + +from .pipe import Component, InferRel, KnowledgeGraph, Pipeline, PipelineFactory + +from .rel import InferRel_OpenNRE, InferRel_Rebel + +from .util import EnumBase, \ + calc_quantile_bins, root_mean_square, stripe_column + +from .version import get_repo_version, \ + __version__, __version_major__, __version_minor__, __version_patch__ + +from .vis import NodeStyle, RenderPyVis + + +__release__ = __version__ + +__title__ = "TextGraphs: raw texts, LLMs, and KGs, oh my!" + +__description__ = "TextGraphs + LLMs + graph ML for entity extraction, linking, ranking, and constructing a lemma graph" # pylint: disable=C0301 + +__copyright__ = "2023-2024, Derwen, Inc." + +__author__ = """\n""".join([ + "derwen.ai " +]) diff --git a/textgraphs/defaults.py b/textgraphs/defaults.py new file mode 100644 index 0000000000000000000000000000000000000000..73c5af83157114df9c758b31bf0b323f005a71f4 --- /dev/null +++ b/textgraphs/defaults.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Default settings for the `TextGraphs` library. 
+ +see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md +""" + +import spacy_dbpedia_spotlight # pylint: disable=E0401 + + +DBPEDIA_MIN_ALIAS: float = 0.8 +DBPEDIA_MIN_SIM: float = 0.9 + +DBPEDIA_SEARCH_API: str = "https://lookup.dbpedia.org/api/search" +DBPEDIA_SPARQL_API: str = "https://dbpedia.org/sparql" +DBPEDIA_SPOTLIGHT_API: str = f"{spacy_dbpedia_spotlight.EntityLinker.base_url}/en" + +FISHING_API: str = "https://cloud.science-miner.com/nerd/service" + +MAX_SKIP: int = 11 + +MREBEL_MODEL: str = "Babelscape/mrebel-large" + +NER_MODEL: str = "tomaarsen/span-marker-roberta-large-ontonotes5" + +OPENNRE_MIN_PROB: float = 0.9 +OPENNRE_MODEL: str = "wiki80_cnn_softmax" + +PAGERANK_ALPHA: float = 0.85 + +SPACY_MODEL: str = "en_core_web_sm" + +WIKIDATA_API: str = "https://www.wikidata.org/w/api.php" diff --git a/textgraphs/doc.py b/textgraphs/doc.py new file mode 100644 index 0000000000000000000000000000000000000000..077840c1c70563a85363c5d0e3fac0fdb5d654ce --- /dev/null +++ b/textgraphs/doc.py @@ -0,0 +1,1353 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=C0302,R0801 + +""" +Implementation of an LLM-augmented `textgraph` algorithm for +constructing a _lemma graph_ from raw, unstructured text source. +The results provide elements for semi-automated construction or +augmentation of a _knowledge graph_. + +This class maintains the state of a graph. Updates get applied by +running methods on `Pipeline` objects, typically per paragraph. 
+ +see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md +""" + +from collections import defaultdict +import asyncio +import csv +import logging +import os +import pathlib +import re +import shutil +import sys +import tempfile +import typing +import zipfile + +from icecream import ic # pylint: disable=E0401 +import networkx as nx # pylint: disable=E0401 +import numpy as np # pylint: disable=E0401 +import pandas as pd # pylint: disable=E0401 +import pulp # pylint: disable=E0401 +import rdflib # pylint: disable=E0401 +import spacy # pylint: disable=E0401 +import transformers # pylint: disable=E0401 +import urllib3 # pylint: disable=E0401 + +from .defaults import PAGERANK_ALPHA +from .elem import Edge, Node, NodeEnum, RelEnum +from .graph import SimpleGraph +from .pipe import Pipeline, PipelineFactory +from .util import calc_quantile_bins, root_mean_square, stripe_column +from .vis import RenderPyVis + + +###################################################################### +## repair the libraries which are borked: + +# workaround: determine whether this is loading into a Jupyter +# notebook, to allow for `tqdm` progress bars +if "ipykernel" in sys.modules: + from tqdm.notebook import tqdm # pylint: disable=E0401,W0611 +else: + from tqdm import tqdm # pylint: disable=E0401 + +# override: HF `transformers` and `tokenizers` have noisy logging +transformers.logging.set_verbosity_error() +os.environ["TOKENIZERS_PARALLELISM"] = "0" + +# override: `OpenNRE` uses `word2vec` which has noisy logging +logging.disable(logging.INFO) + +# override: WikidMedia and others allow their SSL certs to expire +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + +###################################################################### +## class definitions + +class TextGraphs (SimpleGraph): + """ +Construct a _lemma graph_ from the unstructured text source, +then extract ranked phrases using a `textgraph` algorithm. 
def __init__ (
    self,
    *,
    factory: typing.Optional[ PipelineFactory ] = None,
    iri_base: str = IRI_BASE,
    ) -> None:
    """
    Constructor.

        factory:
    optional `PipelineFactory` used to configure components; when omitted, a `PipelineFactory` gets created with its default arguments

        iri_base:
    base IRI kept on the instance as `self.iri_base`; `export_rdf()` uses it as the prefix for the entity IRIs it generates
    """
    super().__init__()

    self.iri_base = iri_base

    # initialize the pipeline factory, falling back to the defaults
    self.factory = PipelineFactory() if factory is None else factory
def _make_class_link (
    self,
    node: Node,
    pipe: Pipeline,
    *,
    debug: bool = False,
    ) -> None:
    """
    Private helper method to construct a link from a recognized entity
    to the node which represents its class, creating the class node on
    first use.

        node:
    recognized entity to be linked; its `label` must be the class IRI

        pipe:
    configured pipeline for this document; supplies the class metadata through `pipe.kg.NER_MAP` and collects the resulting edge

        debug:
    debugging flag
    """
    if debug:
        print("link:", node.label)

    rdf_type: str = str(rdflib.RDF.type)

    # special case of `make_node()`
    if node.label not in self.nodes:
        # find class IRI metadata
        matches: typing.List[typing.Dict[ str, str ]] = [
            entry
            for entry in pipe.kg.NER_MAP.values()
            if entry["iri"] == node.label
        ]

        # the first matching entry describes the class
        meta: typing.Dict[ str, str ] = matches[0]
        definition: str = meta["definition"]
        class_label: str = meta["label"]

        dst: Node = Node(
            len(self.nodes),
            node.label, # type: ignore
            definition,
            rdf_type,
            NodeEnum.IRI,
            label = class_label,
            length = len(class_label.split(" ")),
            count = 1,
        )

        self.nodes[node.label] = dst # type: ignore
    else:
        # class node already exists: count one more reference to it
        dst = self.nodes[node.label]
        dst.count += 1

    node.annotated = True

    # construct a directed edge between them
    edge: Edge = self.make_edge( # type: ignore
        node,
        dst,
        RelEnum.IRI,
        rdf_type,
        node.weight,
        debug = debug,
    )

    if debug:
        ic(edge)

    if edge is not None:
        pipe.edges.append(edge)
def collect_graph_elements (
    self,
    pipe: Pipeline,
    *,
    text_id: int = 0,
    para_id: int = 0,
    debug: bool = False,
    ) -> None:
    """
    Collect the elements of a _lemma graph_ from the results of running
    the `textgraph` algorithm. These elements include: parse dependencies,
    lemmas, entities, and noun chunks.

    Make sure to call beforehand: `TextGraphs.create_pipeline()`

        pipe:
    configured pipeline for this document

        text_id:
    text (top-level document) identifier

        para_id:
    paragraph identifier

        debug:
    debugging flag
    """
    # parse each sentence
    lemma_iter: typing.Iterator[ typing.Tuple[ str, int ]] = pipe.get_ent_lemma_keys()

    for sent_id, sent in enumerate(pipe.ner_doc.sents):
        if debug:
            ic(sent_id, sent, sent.start)

        # `_extract_phrases()` yields exactly one node per token, in
        # token order, so `sent_nodes[k]` stands for the token at
        # document position `sent.start + k`
        sent_nodes: typing.List[ Node ] = list(self._extract_phrases(
            pipe,
            sent_id,
            sent,
            text_id,
            para_id,
            lemma_iter,
        ))
        num_nodes: int = len(sent_nodes)

        if debug:
            ic(sent_nodes)

        for node in sent_nodes:
            # re-map from OntoTypes4 to a formal IRI, if possible
            # then link the inferred class
            if node.kind == NodeEnum.ENT:
                node.label = pipe.kg.remap_ner(node.label)

                if node.label is not None and re.search(r"http[s]*://", node.label) is not None:
                    self._make_class_link(
                        node,
                        pipe,
                        debug = debug,
                    )

            # link parse elements, based on the token's head
            #
            # `head.i` is a document-level token index, whereas
            # `sent_nodes` is indexed from the start of this sentence,
            # so the offset must always be removed -- not only when
            # the raw index happens to overflow the sentence length
            doc_idx: int = node.span.head.i # type: ignore
            head_idx: int = doc_idx - sent.start

            if not 0 <= head_idx < num_nodes and doc_idx < num_nodes:
                # the head lies outside of this sentence, which is
                # presumably a node reused from earlier text: keep
                # the previous behavior of using the raw index
                # NOTE(review): confirm against `make_node()`
                head_idx = doc_idx

            if debug:
                ic(node, num_nodes, node.span.head.i, node.span.head.text, head_idx) # type: ignore # pylint: disable=C0301

            edge: Edge = self.make_edge( # type: ignore
                node,
                sent_nodes[head_idx],
                RelEnum.DEP,
                node.span.dep_, # type: ignore
                1.0,
                debug = debug,
            )

            if edge is not None:
                pipe.edges.append(edge)

            # annotate src nodes whose dependency is `nsubj` or `pobj`
            if node.span.dep_ in [ "nsubj", "pobj" ]: # type: ignore
                node.sub_obj = True

    # overlay unique noun chunks onto the parsed elements,
    self._overlay_noun_chunks(
        pipe,
        text_id = text_id,
        para_id = para_id,
        debug = debug,
    )
def perform_entity_linking (
    self,
    pipe: Pipeline,
    *,
    debug: bool = False,
    ) -> None:
    """
    Perform _entity linking_ by delegating to the `KnowledgeGraph`
    object of the pipeline, then give a default class to each lemma
    node which is still unlabeled.

    Make sure to call beforehand: `TextGraphs.collect_graph_elements()`

        pipe:
    configured pipeline for this document

        debug:
    debugging flag
    """
    pipe.kg.perform_entity_linking(
        self,
        pipe,
        debug = debug,
    )

    # by default, link the baseline semantics
    unlabeled_lemmas = (
        tok
        for tok in pipe.tokens
        if tok.kind == NodeEnum.LEM and tok.label is None
    )

    for tok in unlabeled_lemmas:
        tok.label = str(rdflib.OWL.Thing)
def infer_relations (
    self,
    pipe: Pipeline,
    *,
    debug: bool = False,
    ) -> typing.List[ Edge ]:
    """
    Gather triples representing inferred relations and build edges.

    Make sure to call beforehand: `TextGraphs.collect_graph_elements()`

        pipe:
    configured pipeline for this document

        debug:
    debugging flag

        returns:
    a list of the inferred `Edge` objects, which also get appended to `pipe.edges`
    """
    inferred_edges: typing.List[ Edge ] = []

    for infer_rel in pipe.infer_rels:
        for src, iri, dst in infer_rel.gen_triples(pipe, debug = debug):
            edge = self._infer_rel_construct_edge(src, iri, dst, debug = debug)

            # edge construction may yield `None`; skip those so that
            # `pipe.edges` only ever holds real edges, the same guard
            # used wherever else edges get appended
            if edge is not None:
                inferred_edges.append(edge)

    # update the graph
    pipe.edges.extend(inferred_edges)

    return inferred_edges
debug: bool = False, + ) -> typing.List[ float ]: + """ +Stack-rank the nodes so that entities have priority over lemmas. + + ranks: +list of calculated ranks per node + + debug: +debugging flag + + returns: +ordered list of re-stacked nodes + """ + # build a dataframe of node ranks and counts + df1: pd.DataFrame = pd.DataFrame.from_dict([ + { + "weight": ranks[node.node_id], + "count": node.get_stacked_count(), + "hood": node.neighbors, + "subobj": int(node.sub_obj), + } + for node in self.nodes.values() + ]) + + df1.loc[df1["count"] < 1, "weight"] = 0 + + # normalize by column and calculate quantiles + df2: pd.DataFrame = df1.apply(lambda x: x / x.max(), axis = 0) + bins: np.ndarray = calc_quantile_bins(len(df2.index)) + + # stripe each columns + df3: pd.DataFrame = pd.DataFrame([ + stripe_column(values, bins) + for _, values in df2.items() + ]).T + + # renormalize the ranks + df1["rank"] = df3.apply(root_mean_square, axis=1) + df1.loc[df1["count"] < 1, "rank"] = 0 + + rank_col: np.ndarray = df1["rank"].to_numpy() + rank_col /= sum(rank_col) + + # prepare to stack entities atop lemmas + df1["E"] = df1["rank"] + df1["L"] = df1["rank"] + + df1["entity"] = [ + node.kind == NodeEnum.ENT + for node in self.nodes.values() + ] + + df1.loc[~df1["entity"], "E"] = 0 + df1.loc[df1["entity"], "L"] = 0 + + if debug: + ic(df1) + + # partition the lists to be stacked + E: typing.List[ float ] = [ # pylint: disable=C0103 + rank + for rank in df1["E"].to_numpy() + if rank > 0.0 + ] + + L: typing.List[ float ] = [ # pylint: disable=C0103 + rank + for rank in df1["L"].to_numpy() + if rank > 0.0 + ] + + # just use the calculated ranks when either list is empty + if len(E) < 1 or len(L) < 1: + return ranks + + # configure a system of linear equations + coef0, coef1 = self._solve_restack_coeffs( + sum_e = sum(E), + sum_l = sum(L), + min_e = min(E), + max_l = max(L), + debug = debug, + ) + + df1["stacked"] = df1["E"] * coef0 + df1["L"] * coef1 + + if debug: + ic(df1) + + return 
def get_phrases (
    self
    ) -> typing.Iterator[ dict ]:
    """
Iterate through the nodes which have a positive weight, ordered from
the highest weight to the lowest, describing each one as a dictionary.

Make sure to call beforehand: `TextGraphs.calc_phrase_ranks()`

    yields:
one dictionary per ranked node, with the keys `node_id`, `text`, `pos`, `label`, `count`, `weight`
    """
    ranked: list = [
        candidate
        for candidate in self.nodes.values()
        if candidate.weight > 0
    ]

    # a stable sort: nodes with equal weights keep their original order
    ranked.sort(
        key = lambda item: item.weight,
        reverse = True,
    )

    for phrase in ranked:
        linked: typing.Optional[ str ] = phrase.get_linked_label()
        label: str = self.factory.kg.normalize_prefix(linked)  # type: ignore

        yield {
            "node_id": phrase.node_id,
            "text": phrase.text,
            "pos": phrase.pos,
            "label": label,
            "count": phrase.count,
            "weight": phrase.weight,
        }
+ +Make sure to call beforehand: `TextGraphs.calc_phrase_ranks()` + + returns: +a `pandas.DataFrame` of the extracted entities + """ + return pd.DataFrame.from_dict(self.get_phrases()) + + + ###################################################################### + ## knowledge graph abstraction layer + + def export_rdf ( # pylint: disable=R0914 + self, + *, + lang: str = "en", + ) -> str: + """ +Extract the entities and relations which have IRIs as RDF triples. + + lang: +language identifier + + returns: +RDF triples N3 (Turtle) format as a string + """ + node_list: typing.List[ Node ] = list(self.nodes.values()) + node_keys: typing.List[ str ] = list(self.nodes.keys()) + ref_dict: typing.Dict[ int, rdflib.URIRef ] = {} + rdf_graph: rdflib.Graph = rdflib.Graph() + + # extract entities as RDF + for node_id, node in enumerate(self.nodes.values()): + if node.kind in [ NodeEnum.ENT, NodeEnum.LEM ]: + if node.pos not in [ "VERB" ]: + iri: str = f"{self.iri_base}entity/{node.key.replace(' ', '_').replace('.', '_')}" # pylint: disable=C0301 + subj: rdflib.URIRef = rdflib.URIRef(iri) + ref_dict[node_id] = subj + + rdf_graph.add(( + subj, + rdflib.SKOS.prefLabel, + rdflib.Literal(node.text, lang = lang), + )) + + if node.kind == NodeEnum.ENT and node.annotated: + cls_obj: rdflib.URIRef = rdflib.URIRef(node.label) + cls_id: int = node_keys.index(node.label) # type: ignore + ref_dict[cls_id] = cls_obj + + rdf_graph.add(( + subj, + rdflib.RDF.type, + cls_obj, + )) + + elif node.kind == NodeEnum.IRI: + subj = rdflib.URIRef(node.key) + ref_dict[node_id] = subj + + rdf_graph.add(( + subj, + rdflib.SKOS.prefLabel, + rdflib.Literal(node.label, lang = lang), + )) + + rdf_graph.add(( + subj, + rdflib.SKOS.definition, + rdflib.Literal(node.text, lang = lang), + )) + + # extract relations as RDF + for edge in self.edges.values(): + if edge.kind == RelEnum.INF: + if edge.src_node in ref_dict: + subj = ref_dict.get(edge.src_node) + else: + src_node: Node = node_list[edge.src_node] + subj 
def load_bootstrap_ttl (  # pylint: disable=R0912,R0914
    self,
    ttl_str: str,
    *,
    debug: bool = False,
    ) -> None:
    """
Parse a TTL string with an RDF semantic graph representation to load
bootstrap definitions for the _lemma graph_ prior to parsing, e.g.,
for synonyms.

Only the subjects/objects which get referenced by a relation (or used
as an `rdf:type` class) become nodes. A referenced node which has no
`skos:prefLabel` falls back to using its own key as the label.

    ttl_str:
RDF triples in TTL (Turtle/N3) format

    debug:
debugging flag
    """
    rdf_graph: rdflib.Graph = rdflib.Graph()
    rdf_graph.parse(data = ttl_str)

    rdf_nodes: typing.Dict[ str, dict ] = defaultdict(dict)
    rdf_edges: typing.Set[ tuple ] = set()

    # parse the node data, tally the edges
    for subj, pred, obj in rdf_graph:
        uri: str = self.denormalize_iri(subj)

        if pred == rdflib.SKOS.prefLabel:
            rdf_nodes[uri]["label"] = str(obj)
        elif pred == rdflib.SKOS.definition:
            rdf_nodes[uri]["descrip"] = str(obj)

        elif pred == rdflib.RDF.type:
            dst: str = str(obj)
            rdf_nodes[dst]["ref"] = True
            rdf_nodes[uri]["type"] = dst

        else:
            # any other predicate is a relation between two nodes
            src: str = uri
            rdf_nodes[src]["ref"] = True

            dst = self.denormalize_iri(obj)
            rdf_nodes[dst]["ref"] = True

            rdf_edges.add(( str(pred), src, dst, ))

    # construct the nodes
    for uri, node_dat in rdf_nodes.items():
        if "ref" not in node_dat:
            continue

        if debug:
            ic(uri, node_dat)

        # a key is an IRI only when it *begins* with an HTTP(S) scheme
        node_kind: NodeEnum = NodeEnum.ENT

        if re.match(r"https?://", uri) is not None:
            node_kind = NodeEnum.IRI

        # referenced nodes are not guaranteed to carry a `prefLabel`
        label: str = node_dat.get("label", uri)

        node: Node = self.make_node(
            [],
            uri,
            None,
            node_kind,
            0,
            0,
            0,
            label = label,
            length = len(label.split(" ")),
        )

        # bootstrap definitions have not been observed in any text yet
        node.count = 0
        node.loc = []

        if "type" in node_dat:
            node.pos = node_dat["type"]

        if "descrip" in node_dat:
            node.text = node_dat["descrip"]

        if node_kind == NodeEnum.ENT:
            node.text = label

        if debug:
            ic(node)

    # construct the edges
    for rel, src, dst in rdf_edges:
        src_node: Node = self.nodes[src]
        dst_node: Node = self.nodes[dst]

        if debug:
            node_list: typing.List[ Node ] = list(self.nodes.values())
            print(rel, node_list.index(src_node), node_list.index(dst_node))

        edge_kind: RelEnum = RelEnum.IRI

        if rel == str(rdflib.SKOS.broader):
            edge_kind = RelEnum.SYN

        edge: Edge = self.make_edge(  # type: ignore
            src_node,
            dst_node,
            edge_kind,
            rel,
            1.0,
            debug = debug,
        )

        if debug:
            ic(edge)
def export_kuzu (  # pylint: disable=R0912,R0914
    self,
    *,
    zip_name: str = "lemma.zip",
    debug: bool = False,
    ) -> str:
    """
Export a labeled property graph for KùzuDB (openCypher).

The ZIP archive holds three members within a `cyp/` folder: the
`nodes.csv` and `edges.csv` tables, plus a `demo.py` script which
loads them. An edge is exported only when both of its nodes have a
serialized IRI.

    zip_name:
path of the ZIP file to be written

    debug:
debugging flag

    returns:
name of the generated ZIP file
    """
    subdir: str = "cyp"
    incl_nodes: set = set()

    # the context manager removes the scratch directory exactly once,
    # including when the export fails part-way through
    with tempfile.TemporaryDirectory() as tmp_dir:
        with zipfile.ZipFile(
            zip_name,
            mode = "w",
            compression = zipfile.ZIP_DEFLATED,
            compresslevel = 9,
        ) as zip_fp:
            # write the nodes table
            nodes_path: pathlib.Path = pathlib.Path(tmp_dir) / "nodes.csv"

            # `newline = ""` leaves line endings up to the `csv` writer
            with open(nodes_path, "w", encoding = "utf-8", newline = "") as fp:  # pylint: disable=C0103
                writer = csv.writer(fp)

                for node in self.nodes.values():
                    # juggle the serialized IRIs
                    iri: typing.Optional[ str ] = None

                    if node.kind in [ NodeEnum.ENT, NodeEnum.LEM ]:
                        if node.pos not in [ "VERB" ]:
                            iri = f"{self.iri_base}entity/{node.key.replace(' ', '_').replace('.', '_')}"  # pylint: disable=C0301
                    elif node.kind == NodeEnum.IRI:
                        iri = node.key

                    if iri is not None:
                        incl_nodes.add(node.node_id)

                        node_row: list = [
                            node.node_id,
                            iri,
                            node.weight,
                            str(node.kind),
                            node.key,
                            node.label,
                            node.text,
                            node.pos,
                            node.length,
                            node.count,
                        ]

                        if debug:
                            ic(node_row)

                        writer.writerow(node_row)

            zip_fp.write(
                nodes_path,
                arcname = subdir + "/" + nodes_path.name,
            )

            # write the edges table
            edges_path: pathlib.Path = pathlib.Path(tmp_dir) / "edges.csv"

            with open(edges_path, "w", encoding = "utf-8", newline = "") as fp:  # pylint: disable=C0103
                writer = csv.writer(fp)

                for edge in self.edges.values():
                    # skip any edge which touches a node that was not exported
                    if edge.src_node in incl_nodes and edge.dst_node in incl_nodes:
                        edge_row: list = [
                            edge.src_node,
                            edge.dst_node,
                            edge.rel,
                            edge.prob,
                            str(edge.kind),
                            edge.count,
                        ]

                        if debug:
                            ic(edge_row)

                        writer.writerow(edge_row)

            zip_fp.write(
                edges_path,
                arcname = subdir + "/" + edges_path.name,
            )

            # write the `demo.py` script
            demo_str: str = """
# minimal dependencies
import kuzu
import shutil

# clear space for tables
DB_DIR: str = "db"
shutil.rmtree(DB_DIR, ignore_errors = True)

# instantiate KùzuDB connection
db = kuzu.Database(DB_DIR)
conn = kuzu.Connection(db)

# define table schema
conn.execute(
    "CREATE NODE TABLE subobj(id INT64, iri STRING, prob FLOAT, kind STRING, lemma STRING, label STRING, descrip STRING, pos STRING, length INT16, count INT16, PRIMARY KEY (id))"
)
conn.execute(
    "CREATE REL TABLE triple(FROM subobj TO subobj, pred STRING, prob FLOAT, kind STRING, count INT16)"
)

# load data into tables
conn.execute('COPY subobj FROM "nodes.csv"')
conn.execute('COPY triple FROM "edges.csv"')

# run a simple Cypher query
query: str = "MATCH (s:subobj) RETURN s.id, s.iri;"
results = conn.execute(query)

while results.has_next():
    print(results.get_next())
            """

            zip_fp.writestr(
                subdir + "/demo.py",
                demo_str,
            )

            if debug:
                zip_fp.printdir()

    return zip_name
@dataclass(order=False, frozen=False)
class Node:  # pylint: disable=R0902
    """
A data class representing one node in the graph, i.e., an extracted phrase.
    """
    node_id: int
    key: str
    text: str
    pos: str
    kind: NodeEnum
    span: typing.Optional[ typing.Union[ spacy.tokens.span.Span, spacy.tokens.token.Token ]] = None
    loc: typing.List[ typing.List[ int ] ] = field(default_factory = list)
    label: typing.Optional[ str ] = None
    length: int = 1
    sub_obj: bool = False
    count: int = 0
    neighbors: int = 0
    weight: float = 0.0
    entity: typing.List[ LinkedEntity ] = field(default_factory = list)
    annotated: bool = False


    def get_linked_label (
        self
        ) -> typing.Optional[ str ]:
        """
Prefer the IRI of the first linked entity, when there is one;
fall back to the `label` value of this node.

    returns:
a label for the linked entity
        """
        if self.entity:
            return self.entity[0].iri

        return self.label


    def get_name (
        self
        ) -> str:
        """
Pick a brief name for the graphical depiction of this Node: the
`label` for an IRI node, the `key` for a lemma, the `text` otherwise.

    returns:
brief label to be used in a graph
        """
        name: str = self.text

        if self.kind == NodeEnum.IRI:
            name = self.label  # type: ignore
        elif self.kind == NodeEnum.LEM:
            name = self.key

        return name


    def get_stacked_count (
        self
        ) -> int:
        """
Report the count used by the stack-rank partitions, which is zero
for verbs and for linked entities (IRI nodes).

    returns:
count, used for re-ranking extracted entities
        """
        redacted: bool = self.pos == "VERB" or self.kind == NodeEnum.IRI
        return 0 if redacted else self.count


    def get_pos (
        self
        ) -> typing.Tuple[ int, int ]:
        """
Build a position span for `OpenNRE`, from the `idx` offset of the
span through the last character of the node text.

    returns:
a position span needed for `OpenNRE` relation extraction
        """
        start: int = self.span.idx  # type: ignore
        stop: int = self.span.idx + len(self.text) - 1  # type: ignore
        return ( start, stop, )
def _new_pair_counters () -> typing.Dict[ int, Counter ]:
    """
Build the empty per-relation mapping of entity counters.
    """
    return defaultdict(Counter)


@dataclass(order=False, frozen=False)
class Affinity:
    """
A data class which holds the affinity scores for one relation in
the transformed _graph of relations_: the per-relation counters of
shared entities, the per-relation scores, and a running tally.

NB: sparse tensor algebra would calculate these _affinity scores_
far more efficiently; this approach spells out the process, for
research and debugging.
    """
    pairs: typing.Dict[ int, Counter ] = field(default_factory = _new_pair_counters)
    scores: typing.Dict[ int, float ] = field(default_factory = dict)
    tally: int = 0
def seeds (
    self,
    *,
    debug: bool = False,
    ) -> None:
    """
Prepare the data for the topological transform illustrated in
_lee2023ingram_: snapshot the nodes and edges of the source graph,
then split each edge into two "seeds", one per endpoint.

    debug:
debugging flag
    """
    self.node_list = list(self.source.nodes.values())
    self.edge_list = list(self.source.edges.values())

    if debug:
        print("\n--- triples in source graph ---")

    for edge in self.edge_list:
        # look up the relation, registering it on first sight
        try:
            rel_id: int = self.rel_list.index(edge.rel)
        except ValueError:
            self.rel_list.append(edge.rel)
            rel_id = len(self.rel_list) - 1

        if debug:
            ic(edge.src_node, rel_id, edge.dst_node)
            print("", self.node_list[edge.src_node].text, edge.rel, self.node_list[edge.dst_node].text)  # pylint: disable=C0301

        # enumerate the partially decoupled links ("seeds")
        # for the topological transform: the relation flows into
        # its destination node and out of its source node
        endpoints = (
            ( edge.dst_node, RelDir.HEAD, ),
            ( edge.src_node, RelDir.TAIL, ),
        )

        for node_id, rel_dir in endpoints:
            self.seed_links[node_id].append(
                SheafSeed(node_id, rel_id, rel_dir, edge)
            )
def _transformed_triples (
    self,
    *,
    debug: bool = False,
    ) -> typing.Iterator[ TransArc ]:
    """
Walk the nodes in order of their identifiers and pair up the seeds
attached to each node, which produces the rel-node-rel triples for
a _graph of relations_.

    debug:
debugging flag

    yields:
transformed triples
    """
    for node_id in sorted(self.seed_links):
        seeds = self.seed_links[node_id]

        if debug:
            ic(node_id, len(seeds))

        for seed_a, seed_b in itertools.combinations(seeds, 2):
            # the pair of relation identifiers, in ascending order
            low: int = min(seed_a.rel_id, seed_b.rel_id)
            high: int = max(seed_a.rel_id, seed_b.rel_id)
            pair_key: tuple = ( low, high, )

            if debug:
                print(f" {pair_key} {seed_a.edge.rel}.{seed_a.rel_dir} {self.node_list[node_id].text} {seed_b.edge.rel}.{seed_b.rel_dir}")  # pylint: disable=C0301

            yield TransArc(
                pair_key,
                seed_a.rel_id,
                seed_b.rel_id,
                node_id,
                seed_a.rel_dir,
                seed_b.rel_dir,
            )
transformed triples ---") + + for trans_arc in self._transformed_triples(debug = debug): + if debug: + ic(trans_arc) + print() + + if trans_arc.a_dir == RelDir.HEAD: + self.head_affin[trans_arc.a_rel].pairs[trans_arc.b_rel][trans_arc.node_id] += 1 + else: + self.tail_affin[trans_arc.a_rel].pairs[trans_arc.b_rel][trans_arc.node_id] += 1 + + if trans_arc.b_dir == RelDir.HEAD: + self.head_affin[trans_arc.b_rel].pairs[trans_arc.a_rel][trans_arc.node_id] += 1 + else: + self.tail_affin[trans_arc.b_rel].pairs[trans_arc.a_rel][trans_arc.node_id] += 1 + + + @classmethod + def tally_frequencies ( + cls, + counter: Counter, + ) -> int: + """ +Tally the frequency of shared entities. + + counter: +`counter` data collection for the rel_b/entity pairs + + returns: +tallied values for one relation + """ + sum_freq: int = counter.total() # type: ignore + + for occur in counter.values(): # pylint: disable=W0612 + sum_freq += 1 + + return sum_freq + + + def _collect_tallies ( + self, + *, + debug: bool = False, + ) -> None: + """ +Collect tallies, in preparation for calculating the affinity scores. 
def get_affinity_scores (
    self,
    *,
    debug: bool = False,
    ) -> typing.Dict[ tuple, float ]:
    """
Reproduce metrics based on the example published in _lee2023ingram_

Each unordered pair of relations (including a relation paired with
itself) gets scored from the head/tail tallies. A relation whose
tallies sum to zero shares no entity with any other relation, hence
it contributes nothing instead of causing a division by zero.

    debug:
debugging flag

    returns:
the calculated affinity scores, keyed by the sorted pair of relation identifiers; pairs which score zero are left out
    """
    def share (contrib: int, total: int) -> float:
        # the portion of a relation's tally spent on one partner
        return contrib / float(total) if total else 0.0

    self._collect_tallies(debug = debug)

    scores: typing.Dict[ tuple, float ] = {}
    n_rels: int = len(self.rel_list)

    # every sorted pair `(rel_a, rel_b)` where `rel_a <= rel_b`,
    # generated in lexicographic order
    for rel_a, rel_b in itertools.combinations_with_replacement(range(n_rels), 2):
        pair_affin: float = 0.0

        if rel_b in self.head_affin and rel_a in self.tail_affin:
            rel_a_sum = self.head_affin[rel_a].tally + self.tail_affin[rel_a].tally
            a_contrib = self.tally_frequencies(self.head_affin[rel_b].pairs[rel_a])

            rel_b_sum = self.head_affin[rel_b].tally + self.tail_affin[rel_b].tally
            b_contrib = self.tally_frequencies(self.tail_affin[rel_a].pairs[rel_b])

            pair_affin += share(a_contrib, rel_a_sum) + share(b_contrib, rel_b_sum)

        if rel_b in self.tail_affin and rel_a in self.head_affin:
            rel_a_sum = self.head_affin[rel_a].tally + self.tail_affin[rel_a].tally
            a_contrib = self.tally_frequencies(self.tail_affin[rel_b].pairs[rel_a])

            rel_b_sum = self.head_affin[rel_b].tally + self.tail_affin[rel_b].tally
            b_contrib = self.tally_frequencies(self.head_affin[rel_a].pairs[rel_b])

            pair_affin += share(a_contrib, rel_a_sum) + share(b_contrib, rel_b_sum)

        if pair_affin > 0.0:
            scores[( rel_a, rel_b, )] = pair_affin / 2.0

    return scores
def render_gor_pyvis (
    self,
    scores: typing.Dict[ tuple, float ],
    ) -> pyvis.network.Network:
    """
Build an interactive `PyVis` visualization of the _graph of relations_,
decorating each edge which has a score with its rounded affinity.

    scores:
the calculated affinity scores between pairs of relations (i.e., observed values)

    returns:
a `pyvis.network.Network` representation of the transformed graph
    """
    nx_graph: nx.Graph = self._build_nx_graph(scores)

    pv_graph: pyvis.network.Network = pyvis.network.Network()
    pv_graph.from_nx(nx_graph)

    for pv_edge in pv_graph.get_edges():
        aff: typing.Optional[ float ] = scores.get(( pv_edge["from"], pv_edge["to"], ))

        if aff is None:
            # no score recorded for this orientation of the pair
            continue

        shown: float = round(aff, 2)
        pv_edge["title"] = shown
        pv_edge["label"] = shown
        pv_edge["width"] = int(aff * 10.0)

    return pv_graph
def make_node (  # pylint: disable=R0913,R0914
    self,
    tokens: typing.List[ Node ],
    key: str,
    span: typing.Optional[ spacy.tokens.token.Token ],
    kind: NodeEnum,
    text_id: int,
    para_id: int,
    sent_id: int,
    *,
    label: typing.Optional[ str ] = None,
    length: int = 1,
    linked: bool = True,
    ) -> Node:
    """
Lookup and return a `Node` object.
By default, link matching keys into the same node.
Otherwise instantiate a new node if it does not exist already.

When `span` is `None` the node text defaults to `key`, its part of
speech defaults to `"PROPN"`, and its token position defaults to zero.

    tokens:
list of parsed tokens; the node gets appended here unless its kind is a noun chunk or an IRI

    key:
lemma key (invariant)

    span:
token span for the parsed entity; may be `None` for nodes which are not parsed from text

    kind:
the kind of this `Node` object

    text_id:
text (top-level document) identifier

    para_id:
paragraph identitifer

    sent_id:
sentence identifier

    label:
node label (for a new object)

    length:
length of token span

    linked:
flag for whether this links to an entity

    returns:
the constructed `Node` object
    """
    token_id: int = 0
    token_text: str = key
    token_pos: str = "PROPN"

    if span is not None:
        token_id = span.i
        token_text = span.text
        token_pos = span.pos_

    location: typing.List[ int ] = [  # type: ignore
        text_id,
        para_id,
        sent_id,
        token_id,
    ]

    if not linked:
        # construct a placeholder node (stopwords)
        # NB: omit locations
        # NOTE(review): this replaces any node already stored under
        # `key` -- confirm that callers only use unique keys here
        self.nodes[key] = Node(
            len(self.nodes),
            key,
            token_text,
            token_pos,
            kind,
            span = span,
            length = length,
        )

    elif key in self.nodes:
        # link to previously constructed entity node
        known: Node = self.nodes[key]
        known.count += 1
        known.loc.append(location)

        # reset the span, if this node was loaded from a
        # previous pipeline or from bootstrap definitions
        if known.span is None:
            known.span = span

    else:
        # construct a new node for entity or lemma
        self.nodes[key] = Node(
            len(self.nodes),
            key,
            token_text,
            token_pos,
            kind,
            span = span,
            loc = [ location ],
            label = label,
            length = length,
            count = 1,
        )

    node: Node = self.nodes[key]

    if kind not in [ NodeEnum.CHU, NodeEnum.IRI ]:
        tokens.append(node)

    return node
+ + src_node: +source node in the triple + + dst_node: +destination node in the triple + + kind: +the kind of this `Edge` object + + rel: +relation label + + prob: +probability of this `Edge` within the graph + + key: +lemma key (invariant); generate a key if this is not provided + + debug: +debugging flag + + returns: +the constructed `Edge` object; this may be `None` if the input parameters indicate skipping the edge + """ + if key is None: + key = ".".join([ + str(src_node.node_id), + str(dst_node.node_id), + rel.replace(" ", "_"), + str(kind.value), + ]) + + if debug: + ic(key) + + if key in self.edges: + self.edges[key].count += 1 + + elif src_node.node_id != dst_node.node_id: + # preclude cycles in the graph + self.edges[key] = Edge( + src_node.node_id, + dst_node.node_id, + kind, + rel, + prob, + ) + + if debug: + ic(self.edges.get(key)) + + return self.edges.get(key) + + + def dump_lemma_graph ( + self + ) -> str: + """ +Dump the _lemma graph_ as a JSON string in _node-link_ format, +suitable for serialization and subsequent use in JavaScript, +Neo4j, Graphistry, etc. 
+ +Make sure to call beforehand: `TextGraphs.calc_phrase_ranks()` + + returns: +a JSON representation of the exported _lemma graph_ in +[_node-link_](https://networkx.org/documentation/stable/reference/readwrite/json_graph.html) +format + """ + # populate the optional node properties + for node in self.nodes.values(): + nx_node = self.lemma_graph.nodes[node.node_id] + nx_node["name"] = node.text + nx_node["kind"] = str(node.kind) + nx_node["subobj"] = node.sub_obj + nx_node["pos"] = node.pos + nx_node["loc"] = str(node.loc) + nx_node["length"] = node.length + nx_node["hood"] = node.neighbors + nx_node["anno"] = node.annotated + + # juggle the serialized IRIs + if node.kind in [ NodeEnum.IRI ]: + nx_node["iri"] = node.key + elif node.label is not None and node.label.startswith("http"): + nx_node["iri"] = node.label + else: + nx_node["iri"] = None + + # emulate a node-link format serialization, using the + # default `NetworkX.node_link_data()` property names + edge_list: typing.List[ dict ] = [] + + for src, dst, props in self.lemma_graph.edges.data(): + props["source"] = src + props["target"] = dst + edge_list.append(props) + + node_link: dict = { + "directed": True, + "multigraph": True, + "nodes": [ + props + for node_id, props in self.lemma_graph.nodes.data() + ], + "links": edge_list, + "graph": {} + } + + return json.dumps( + node_link, + sort_keys = True, + indent = 2, + separators = ( ",", ":" ), + ) + + + def load_lemma_graph ( # pylint: disable=R0914 + self, + json_str: str, + *, + debug: bool = False, + ) -> None: + """ +Load from a JSON string in +a JSON representation of the exported _lemma graph_ in +[_node-link_](https://networkx.org/documentation/stable/reference/readwrite/json_graph.html) +format + + debug: +debugging flag + """ + dat: dict = json.loads(json_str) + tokens: typing.List[ Node ] = [] + to_link: typing.Dict[ str, str ] = {} + + # deserialize the nodes + for nx_node in dat.get("nodes"): # type: ignore + if debug: + ic(nx_node) + + kind: 
NodeEnum = NodeEnum.decode(nx_node["kind"]) # type: ignore + label: typing.Optional[ str ] = nx_node["label"] + + if kind in [ NodeEnum.ENT ] and nx_node["iri"] is not None: + label = nx_node["iri"] + + node: Node = self.make_node( + tokens, + nx_node["lemma"], + None, + kind, + 0, + 0, + 0, + label = label, + length = nx_node["length"], + ) + + node.text = nx_node["name"] + node.pos = nx_node["pos"] + node.loc = eval(nx_node["loc"]) # pylint: disable=W0123 + node.count = int(nx_node["count"]) + node.neighbors = int(nx_node["hood"]) + node.annotated = nx_node["anno"] + + # note which `Node` objects need to have entities linked + if kind == NodeEnum.ENT and nx_node["iri"] is not None: + to_link[node.key] = nx_node["iri"] + + if debug: + ic(node) + + # re-link the entities + for src_key, cls_key in to_link.items(): + src_node: Node = self.nodes.get(src_key) # type: ignore + cls_node: Node = self.nodes.get(cls_key) # type: ignore + + src_node.entity.append( + LinkedEntity( + cls_node.span, + cls_node.label, # type: ignore + cls_node.length, + cls_node.pos, + cls_node.weight, + 0, + None, + ) + ) + + # deserialize the edges + node_list: typing.List[ Node ] = list(self.nodes.values()) + + for nx_edge in dat.get("links"): # type: ignore + if debug: + ic(nx_edge) + + edge: Edge = self.make_edge( # type: ignore + node_list[nx_edge["source"]], + node_list[nx_edge["target"]], + RelEnum.decode(nx_edge["kind"]), # type: ignore + nx_edge["title"], + float(nx_edge["prob"]), + key = nx_edge["lemma"], + ) + + edge.count = int(nx_edge["count"]) + + if debug: + ic(edge) diff --git a/textgraphs/kg.py b/textgraphs/kg.py new file mode 100644 index 0000000000000000000000000000000000000000..a4e46fc4912bb30b23ed6d6d76aed9211df6643c --- /dev/null +++ b/textgraphs/kg.py @@ -0,0 +1,1215 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=C0302 + +""" +This class provides a wrapper for access to a _knowledge graph_, which +then runs _entity linking_ and other functions in 
class KGWikiMedia (KnowledgeGraph):  # pylint: disable=R0902,R0903
    """
    Manage access to WikiMedia-related APIs.

    NOTE(review): every HTTP request below passes `verify = False`, which
    disables TLS certificate verification, and sets no timeout. This is
    left as-is since deployments may rely on it -- confirm, then harden.
    """
    NER_MAP: typing.Dict[ str, dict ] = OrderedDict({
        "CARDINAL": {
            "iri": "http://dbpedia.org/resource/Cardinal_number",
            "definition": "Numerals that do not fall under another type",
            "label": "cardinal number",
        },
        "DATE": {
            "iri": "http://dbpedia.org/ontology/date",
            "definition": "Absolute or relative dates or periods",
            "label": "date",
        },
        "EVENT": {
            "iri": "http://dbpedia.org/ontology/Event",
            "definition": "Named hurricanes, battles, wars, sports events, etc.",
            "label": "event",
        },
        "FAC": {
            "iri": "http://dbpedia.org/ontology/Infrastructure",
            "definition": "Buildings, airports, highways, bridges, etc.",
            "label": "infrastructure",
        },
        "GPE": {
            "iri": "http://dbpedia.org/ontology/Country",
            "definition": "Countries, cities, states",
            "label": "country",
        },
        "LANGUAGE": {
            "iri": "http://dbpedia.org/ontology/Language",
            "definition": "Any named language",
            "label": "language",
        },
        "LAW": {
            "iri": "http://dbpedia.org/ontology/Law",
            "definition": "Named documents made into laws",
            "label": "law",
        },
        "LOC": {
            "iri": "http://dbpedia.org/ontology/Place",
            "definition": "Non-GPE locations, mountain ranges, bodies of water",
            "label": "place",
        },
        "MONEY": {
            "iri": "http://dbpedia.org/resource/Money",
            "definition": "Monetary values, including unit",
            "label": "money",
        },
        "NORP": {
            "iri": "http://dbpedia.org/ontology/nationality",
            "definition": "Nationalities or religious or political groups",
            "label": "nationality",
        },
        "ORDINAL": {
            "iri": "http://dbpedia.org/resource/Ordinal_number",
            "definition": "Ordinal number, i.e., first, second, etc.",
            "label": "ordinal number",
        },
        "ORG": {
            "iri": "http://dbpedia.org/ontology/Organisation",
            "definition": "Companies, agencies, institutions, etc.",
            "label": "organization",
        },
        "PERCENT": {
            "iri": "http://dbpedia.org/resource/Percentage",
            "definition": "Percentage",
            "label": "percentage",
        },
        "PERSON": {
            "iri": "http://dbpedia.org/ontology/Person",
            "definition": "People, including fictional",
            "label": "person",
        },
        "PRODUCT": {
            "iri": "http://dbpedia.org/ontology/product",
            "definition": "Vehicles, weapons, foods, etc. (Not services)",
            "label": "product",
        },
        "QUANTITY": {
            "iri": "http://dbpedia.org/resource/Quantity",
            "definition": "Measurements, as of weight or distance",
            "label": "quantity",
        },
        "TIME": {
            "iri": "http://dbpedia.org/ontology/time",
            "definition": "Times smaller than a day",
            "label": "time",
        },
        "WORK OF ART": {
            "iri": "http://dbpedia.org/resource/Work_of_art",
            "definition": "Titles of books, songs, etc.",
            "label": "work of art",
        },
    })

    NS_PREFIX: typing.Dict[ str, str ] = OrderedDict({
        "dbc": "http://dbpedia.org/resource/Category:",
        "dbt": "http://dbpedia.org/resource/Template:",
        "dbr": "http://dbpedia.org/resource/",
        "yago":"http://dbpedia.org/class/yago/",
        "dbd": "http://dbpedia.org/datatype/",
        "dbo": "http://dbpedia.org/ontology/",
        "dbp": "http://dbpedia.org/property/",
        "units": "http://dbpedia.org/units/",
        "dbpedia-commons": "http://commons.dbpedia.org/resource/",
        "dbpedia-wikicompany": "http://dbpedia.openlinksw.com/wikicompany/",
        "dbpedia-wikidata": "http://wikidata.dbpedia.org/resource/",
        "wd": "http://www.wikidata.org/",
        "wd_ent": "http://www.wikidata.org/entity/",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "schema": "https://schema.org/",
        "owl": "http://www.w3.org/2002/07/owl#",
    })


    def __init__ (  # pylint: disable=W0102
        self,
        *,
        spotlight_api: str = DBPEDIA_SPOTLIGHT_API,
        dbpedia_search_api: str = DBPEDIA_SEARCH_API,
        dbpedia_sparql_api: str = DBPEDIA_SPARQL_API,
        wikidata_api: str = WIKIDATA_API,
        ner_map: dict = NER_MAP,
        ns_prefix: dict = NS_PREFIX,
        min_alias: float = DBPEDIA_MIN_ALIAS,
        min_similarity: float = DBPEDIA_MIN_SIM,
        ) -> None:
        """
        Constructor.

            spotlight_api:
        `DBPedia Spotlight` API or equivalent local service

            dbpedia_search_api:
        `DBPedia Search` API or equivalent local service

            dbpedia_sparql_api:
        `DBPedia SPARQL` API or equivalent local service

            wikidata_api:
        `Wikidata Search` API or equivalent local service

            ner_map:
        named entity map for standardizing IRIs; the default is the
        class-level `NER_MAP` object itself, not a copy

            ns_prefix:
        RDF namespace prefixes; the default is the class-level
        `NS_PREFIX` object itself, not a copy

            min_alias:
        minimum alias probability threshold for accepting linked entities

            min_similarity:
        minimum label similarity threshold for accepting linked entities
        """
        self.spotlight_api: str = spotlight_api
        self.dbpedia_search_api: str = dbpedia_search_api
        self.dbpedia_sparql_api: str = dbpedia_sparql_api
        self.wikidata_api: str = wikidata_api
        self.ner_map: dict = ner_map
        self.ns_prefix: dict = ns_prefix
        self.min_alias: float = min_alias
        self.min_similarity: float = min_similarity

        # `ent_cache` holds DBPedia search hits keyed by "dbpedia:<query>";
        # `iri_cache` is shared by `resolve_rel_iri()` (keyed by relation
        # label) and `dbpedia_wikidata_equiv()` (keyed by IRI)
        self.ent_cache: dict = {}
        self.iri_cache: dict = {}

        self.markdowner = markdown2.Markdown()


    def augment_pipe (
        self,
        factory: PipelineFactory,
        ) -> None:
        """
        Encapsulate a `spaCy` call to `add_pipe()` configuration.

            factory:
        a `PipelineFactory` used to configure components
        """
        factory.aux_pipe.add_pipe(
            "dbpedia_spotlight",
            config = {
                "dbpedia_rest_endpoint": self.spotlight_api,  # type: ignore
            },
        )


    def remap_ner (
        self,
        label: typing.Optional[ str ],
        ) -> typing.Optional[ str ]:
        """
        Remap the OntoTypes4 values from NER output to more general-purpose IRIs.

            label:
        input NER label, an `OntoTypes4` value

            returns:
        an IRI for the named entity, or `None` if the label is not mapped
        """
        if label is None:
            return None

        try:
            iri: typing.Optional[ dict ] = self.ner_map.get(label)

            if iri is not None:
                return iri["iri"]

        except TypeError as ex:
            ic(ex)
            print(f"unknown label: {label}")

        return None


    def normalize_prefix (
        self,
        iri: str,
        *,
        debug: bool = False,
        ) -> str:
        """
        Normalize the given IRI using the standard DBPedia namespace prefixes.

        Among the namespaces whose host matches and whose path is a prefix
        of the IRI path, the one with the longest path wins (ties keep the
        order of `ns_prefix`), and only that leading path gets stripped.
        Hence `http://www.wikidata.org/entity/Q42` maps to `wd_ent:Q42`
        rather than being captured by the shorter `wd` namespace.

            iri:
        input IRI, in fully-qualified domain representation

            debug:
        debugging flag

            returns:
        the compact IRI representation, using an RDF namespace prefix;
        the input IRI unchanged if no namespace matches
        """
        iri_parse: urllib.parse.ParseResult = urllib.parse.urlparse(iri)

        if debug:
            ic(iri_parse)

        best_prefix: typing.Optional[ str ] = None
        best_path: str = ""

        for prefix, ns_fqdn in self.ns_prefix.items():
            ns_parse: urllib.parse.ParseResult = urllib.parse.urlparse(ns_fqdn)

            if debug:
                ic(prefix, ns_parse.netloc, ns_parse.path, ns_parse.fragment)

            if iri_parse.netloc != ns_parse.netloc:
                continue

            if not iri_parse.path.startswith(ns_parse.path):
                continue

            # keep the most specific namespace seen so far
            if best_prefix is None or len(ns_parse.path) > len(best_path):
                best_prefix = prefix
                best_path = ns_parse.path

        if best_prefix is None:
            # normalization failed
            return iri

        if len(iri_parse.fragment) > 0:
            return f"{best_prefix}:{iri_parse.fragment}"

        # strip the namespace path from the front only
        slug: str = iri_parse.path[len(best_path):]
        return f"{best_prefix}:{slug}"


    def perform_entity_linking (
        self,
        graph: SimpleGraph,
        pipe: Pipeline,
        *,
        debug: bool = False,
        ) -> None:
        """
        Perform _entity linking_ based on `DBPedia Spotlight` and other services.

            graph:
        source graph

            pipe:
        configured pipeline for the current document

            debug:
        debugging flag
        """
        # first pass: use "spotlight" API to markup text
        # second pass: use KG search on entities which weren't linked by Spotlight
        # both are lazy generators, so the second one only starts
        # inspecting `pipe.tokens` once the first pass has been consumed
        passes: typing.Tuple[ typing.Iterator[ LinkedEntity ], ... ] = (
            self._link_spotlight_entities(pipe, debug = debug),
            self._link_kg_search_entities(pipe, debug = debug),
        )

        for iter_ents in passes:
            for link in iter_ents:
                _ = self._make_link(
                    graph,
                    pipe,
                    link,
                    str(rdflib.RDF.type),
                    debug = debug,
                )

                _ = self._secondary_entity_linking(
                    graph,
                    pipe,
                    link,
                    debug = debug,
                )


    def resolve_rel_iri (
        self,
        rel: str,
        *,
        lang: str = "en",
        debug: bool = False,
        ) -> typing.Optional[ str ]:
        """
        Resolve a `rel` string from a _relation extraction_ model which has
        been trained on this _knowledge graph_, which defaults to using the
        `WikiMedia` graphs.

            rel:
        relation label; many RE projects source these labels from Wikidata

            lang:
        language identifier

            debug:
        debugging flag

            returns:
        a resolved IRI, or `None` if the lookup failed or the Wikidata
        property has neither a `P1628` nor a `P2235` claim
        """
        # first, check the cache
        if rel in self.iri_cache:
            return self.iri_cache.get(rel)

        # otherwise construct a Wikidata API search
        try:
            hit: dict = self._wikidata_endpoint(
                rel,
                search_type = "property",
                lang = lang,
                debug = debug,
            )

            if debug:
                ic(hit["label"], hit["id"])

            # get the `claims` of the Wikidata property
            prop_id: str = hit["id"]
            prop_dict: dict = get_entity_dict_from_api(prop_id)
            claims: dict = prop_dict["claims"]

            if "P1628" in claims:
                # use `equivalent property` if available
                iri: str = claims["P1628"][0]["mainsnak"]["datavalue"]["value"]
            elif "P2235" in claims:
                # use `external superproperty` as a fallback
                iri = claims["P2235"][0]["mainsnak"]["datavalue"]["value"]
            else:
                ic("no related claims", rel)
                return None

            if debug:
                ic(iri)

            # update the cache
            self.iri_cache[rel] = iri
            return iri

        except requests.exceptions.ConnectionError as r_ex:
            ic(r_ex)
            return None
        except Exception as ex:  # pylint: disable=W0718
            ic(ex)
            traceback.print_exc()
            return None


    ######################################################################
    ## private methods, customized per KG instance

    def _wikidata_endpoint (
        self,
        query: str,
        *,
        search_type: str = "item",
        lang: str = "en",
        debug: bool = False,
        ) -> dict:
        """
        Call a generic endpoint for Wikidata API.
        Raises various untrapped exceptions, to be handled by caller;
        e.g., an empty result list raises `IndexError`.

            query:
        query string

            search_type:
        search type

            lang:
        language identifier

            debug:
        debugging flag

            returns:
        the first search hit; an empty dict if the status was not `OK`
        """
        hit: dict = {}

        params: dict = {
            "action": "wbsearchentities",
            "type": search_type,
            "language": lang,
            "format": "json",
            "continue": "0",
            "search": query,
        }

        response: requests.models.Response = requests.get(
            self.wikidata_api,
            params = params,
            verify = False,
            headers = {
                "Accept": "application/json",
            },
        )

        if debug:
            ic(response.status_code)

        # check for API success
        if http.HTTPStatus.OK == response.status_code:
            dat: dict = response.json()
            hit = dat["search"][0]

        return hit


    @classmethod
    def _match_aliases (
        cls,
        query: str,
        label: str,
        aliases: typing.List[ str ],
        *,
        debug: bool = False,
        ) -> typing.Tuple[ float, str ]:
        """
        Find the best-matching alias for a search term.

            query:
        query string; callers pass it already lower-cased

            label:
        entity label to be matched against the available aliases

            aliases:
        list of the available aliases

            debug:
        debugging flag

            returns:
        a `(similarity, text)` pair for the closest match, where the
        similarity is a `SequenceMatcher` ratio
        """
        # best case scenario: the label is an exact match
        if query == label.lower():
            return ( 1.0, label, )

        # ...therefore the label is not an exact match
        prob_list: typing.List[ typing.Tuple[ float, str ]] = [
            ( SequenceMatcher(None, query, label.lower()).ratio(), label, )
        ]

        # fallback: test the aliases
        for alias in aliases:
            prob: float = SequenceMatcher(None, query, alias.lower()).ratio()

            if prob == 1.0:
                # early termination for success
                return ( prob, alias, )

            prob_list.append(( prob, alias, ))

        # find the closest match
        prob_list.sort(reverse = True)

        if debug:
            ic(prob_list)

        return prob_list[0]


    def _md_to_text (
        self,
        md_text: str,
        ) -> str:
        """
        Convert markdown to plain text.

            md_text:
        markdown text (unrendered)

            returns:
        rendered plain text as a string
        """
        soup: BeautifulSoup = BeautifulSoup(
            self.markdowner.convert(md_text),
            features = "html.parser",
        )

        return soup.get_text().strip()


    def wikidata_search (
        self,
        query: str,
        *,
        lang: str = "en",
        debug: bool = False,
        ) -> typing.Optional[ KGSearchHit ]:
        """
        Query the Wikidata search API.

            query:
        query string

            lang:
        language identifier

            debug:
        debugging flag

            returns:
        search hit, if any; `None` on any error
        """
        try:
            hit: dict = self._wikidata_endpoint(
                query,
                search_type = "item",
                lang = lang,
                debug = debug,
            )

            # extract the needed properties
            url: str = hit["concepturi"]
            label: str = hit["label"]
            descrip: str = hit["description"]

            # determine match likelihood
            prob, _ = self._match_aliases(
                query.lower(),
                label,
                [],
                debug = debug,
            )

            if debug:
                ic(query, url, label, descrip, prob)

            # return a linked entity
            wiki_ent: KGSearchHit = KGSearchHit(
                url,
                label,
                descrip,
                [],
                prob,
            )

            return wiki_ent

        except requests.exceptions.ConnectionError as r_ex:
            ic(r_ex)
        except Exception as ex:  # pylint: disable=W0718
            ic(ex)
            traceback.print_exc()

        return None


    def dbpedia_search_entity (  # pylint: disable=R0914
        self,
        query: str,
        *,
        lang: str = "en",
        debug: bool = False,
        ) -> typing.Optional[ KGSearchHit ]:
        """
        Perform a DBPedia API search.
        Successful hits are cached by lower-cased query.

            query:
        query string

            lang:
        language identifier

            debug:
        debugging flag

            returns:
        search hit, if any; `None` on a failed call or any error
        """
        # first, check the cache
        key: str = "dbpedia:" + query.lower()

        if key in self.ent_cache:
            return self.ent_cache.get(key)

        params: dict = {
            "format": "json",
            "language": lang,
            "query": query,
        }

        try:
            response: requests.models.Response = requests.get(
                self.dbpedia_search_api,
                params = params,
                verify = False,
                headers = {
                    "Accept": "application/json",
                },
            )

            if debug:
                ic(response.status_code)

            # check for failed API calls
            if http.HTTPStatus.OK != response.status_code:
                return None

            dat: dict = response.json()
            hit: dict = dat["docs"][0]

            if debug:
                ic(json.dumps(hit, indent = 2))

            iri: str = hit["resource"][0]
            label: str = self._md_to_text(hit["label"][0])
            descrip: str = self._md_to_text(hit["comment"][0])

            aliases: typing.List[ str ] = [
                self._md_to_text(alias)
                for alias in hit["redirectlabel"]
            ]

            prob, best_match = self._match_aliases(
                query.lower(),
                label,
                aliases,
                debug = debug,
            )

            if debug:
                ic(iri, label, descrip, aliases, prob, best_match)

            ent: KGSearchHit = KGSearchHit(
                iri,
                label,
                descrip,
                aliases,
                prob,
            )

            # update the cache
            self.ent_cache[key] = ent
            return ent

        except requests.exceptions.ConnectionError as r_ex:
            ic(r_ex)
            return None
        except Exception as ex:  # pylint: disable=W0718
            ic(ex)
            traceback.print_exc()
            return None


    def dbpedia_sparql_query (
        self,
        sparql: str,
        *,
        debug: bool = False,
        ) -> dict:
        """
        Perform a SPARQL query on DBPedia.

            sparql:
        SPARQL query string

            debug:
        debugging flag

            returns:
        dictionary of query results; empty on a failed call or any error
        """
        dat: dict = {}

        if debug:
            print(sparql)

        params: dict = {
            "query": sparql,
        }

        try:
            response: requests.models.Response = requests.get(
                self.dbpedia_sparql_api,
                params = params,
                verify = False,
                headers = {
                    "Accept": "application/json",
                },
            )

            if debug:
                ic(response.status_code)

            # only parse the payload of successful API calls
            if http.HTTPStatus.OK == response.status_code:
                dat = response.json()

        except requests.exceptions.ConnectionError as r_ex:
            ic(r_ex)
        except Exception as ex:  # pylint: disable=W0718
            ic(ex)
            traceback.print_exc()

        return dat


    def dbpedia_wikidata_equiv (
        self,
        dbpedia_iri: str,
        *,
        debug: bool = False,
        ) -> typing.Optional[ str ]:
        """
        Perform a SPARQL query on DBPedia to find an equivalent Wikidata entity.

            dbpedia_iri:
        IRI in DBpedia; it is substituted verbatim into the query as the
        subject, e.g., in the compact form from `normalize_prefix()`

            debug:
        debugging flag

            returns:
        equivalent IRI in Wikidata, or `None` if there was no binding
        """
        # first, check the cache
        if dbpedia_iri in self.iri_cache:
            return self.iri_cache.get(dbpedia_iri)

        sparql: str = """
SELECT DISTINCT ?wikidata_concept
WHERE {{
  {} owl:sameAs ?wikidata_concept .
  FILTER(CONTAINS(STR(?wikidata_concept), "www.wikidata.org"))
}}
LIMIT 1000
        """.strip().replace("\n", " ").format(dbpedia_iri)

        dat: dict = self.dbpedia_sparql_query(
            sparql,
            debug = debug,
        )

        try:
            hit: dict = dat["results"]["bindings"][0]

            if debug:
                print(json.dumps(hit, indent = 2))

            equiv_iri: str = hit["wikidata_concept"]["value"]

            if debug:
                ic(equiv_iri)

            # update the cache
            self.iri_cache[dbpedia_iri] = equiv_iri
            return equiv_iri

        except Exception as ex:  # pylint: disable=W0718
            ic(ex)
            traceback.print_exc()
            return None


    ######################################################################
    ## entity linking

    def _link_spotlight_entities (  # pylint: disable=R0912,R0914
        self,
        pipe: Pipeline,
        *,
        debug: bool = False,
        ) -> typing.Iterator[ LinkedEntity ]:
        """
        Iterator for the results of using `DBPedia Spotlight` to markup
        text with _entity linking_

            pipe:
        configured pipeline for the current document

            debug:
        debugging flag

            yields:
        candidate linked entities
        """
        ents: typing.List[ spacy.tokens.span.Span ] = []

        if pipe.aux_doc is not None:
            # keep the entity spans; discarding this list would leave
            # `ents` empty and silently disable the whole pass
            ents = list(pipe.aux_doc.ents)

        if debug:
            ic(ents)

        # walk the tokens and the entity spans in lockstep:
        # `tok_idx` advances by each token's length and gets compared
        # with the start offset of the next pending entity
        ent_idx: int = 0
        tok_idx: int = 0

        for i, tok in enumerate(pipe.tokens):  # pylint: disable=R1702
            if debug:
                print()
                ic(tok_idx, tok.text, tok.pos)
                ic(ent_idx, len(ents))

            if ent_idx < len(ents):
                ent = ents[ent_idx]

                if debug:
                    ic(ent.start, tok_idx)

                if ent.start == tok_idx:
                    try:
                        if debug:
                            ic(ent.text, ent.start, len(ent))
                            ic(ent.kb_id_, ent._.dbpedia_raw_result["@similarityScore"])
                            ic(ent._.dbpedia_raw_result)

                        prob: float = float(ent._.dbpedia_raw_result["@similarityScore"])
                        count: int = int(ent._.dbpedia_raw_result["@support"])

                        if tok.pos == "PROPN" and prob >= self.min_similarity:
                            kg_ent: typing.Optional[ KGSearchHit ] = self.dbpedia_search_entity(  # type: ignore # pylint: disable=C0301
                                ent.text,
                                debug = debug,
                            )

                            if debug:
                                ic(kg_ent)

                            if kg_ent is not None and kg_ent.prob > self.min_alias:  # type: ignore
                                iri: str = ent.kb_id_

                                dbp_link: LinkedEntity = LinkedEntity(
                                    ent,
                                    iri,
                                    len(ent),
                                    "dbpedia",
                                    prob,
                                    i,
                                    kg_ent,  # type: ignore
                                    count = count,
                                )

                                if debug:
                                    ic("found", dbp_link)

                                yield dbp_link

                    except Exception as ex:  # pylint: disable=W0718
                        ic(ex)
                        traceback.print_exc()

                    ent_idx += 1

            tok_idx += tok.length


    def _link_kg_search_entities (
        self,
        pipe: Pipeline,
        *,
        debug: bool = False,
        ) -> typing.Iterator[ LinkedEntity ]:
        """
        Iterator for the results of using `DBPedia Search` directly for
        _entity linking_.

            pipe:
        configured pipeline for the current document

            debug:
        debugging flag

            yields:
        search hits
        """
        for i, node in enumerate(pipe.tokens):  # pylint: disable=R1702
            # only consider entity nodes which have not been linked yet
            if node.kind not in [ NodeEnum.ENT ] or len(node.entity) >= 1:
                continue

            kg_ent: typing.Optional[ KGSearchHit ] = self.dbpedia_search_entity(  # type: ignore # pylint: disable=C0301
                node.text,
                debug = debug,
            )

            # the search returns `None` for a failed call or no hit
            if kg_ent is None or kg_ent.prob <= self.min_alias:
                continue

            dbp_link: LinkedEntity = LinkedEntity(
                node.span,
                kg_ent.iri,
                node.length,
                "dbpedia",
                kg_ent.prob,
                i,
                kg_ent,
            )

            if debug:
                ic("found", dbp_link)

            yield dbp_link


    def _make_link (
        self,
        graph: SimpleGraph,
        pipe: Pipeline,
        link: LinkedEntity,
        rel: str,
        *,
        debug: bool = False,
        ) -> Node:
        """
        Link to previously constructed entity node;
        otherwise construct a new node for this linked entity.

            graph:
        source graph

            pipe:
        configured pipeline for the current document

            link:
        entity to be linked

            rel:
        relation label

            debug:
        debugging flag

            returns:
        the constructed `Node` object
        """
        if debug:
            ic(link)

        # special case of `make_node()`
        if link.iri in graph.nodes:
            graph.nodes[link.iri].count += 1

        else:
            graph.nodes[link.iri] = Node(
                len(graph.nodes),
                link.iri,
                link.kg_ent.descrip,  # type: ignore
                rel,
                NodeEnum.IRI,
                span = link.span,
                label = link.kg_ent.label,  # type: ignore
                length = link.length,
                count = 1,
            )

        src_node: Node = pipe.tokens[link.token_id]
        src_node.annotated = True

        dst_node: Node = graph.nodes.get(link.iri)  # type: ignore

        if debug:
            ic(src_node, dst_node)

        # back-link to the parsed entity object
        pipe.tokens[link.token_id].entity.append(link)

        # construct a directed edge between them
        edge: Edge = graph.make_edge(  # type: ignore
            src_node,
            dst_node,
            RelEnum.IRI,
            rel,
            link.prob,
            debug = debug,
        )

        if debug:
            ic(edge)

        if edge is not None:
            pipe.edges.append(edge)

        # return the linked node
        return dst_node


    def _secondary_entity_linking (
        self,
        graph: SimpleGraph,
        pipe: Pipeline,
        link: LinkedEntity,
        *,
        debug: bool = False,
        ) -> typing.Optional[ Edge ]:
        """
        Perform secondary _entity linking_, e.g., based on Wikidata API.

            graph:
        source graph

            pipe:
        configured pipeline for the current document

            link:
        entity to be linked

            debug:
        debugging flag

            returns:
        the constructed `Edge` object, or `None` if there was no Wikidata
        hit above the `min_similarity` threshold
        """
        wd_ent: typing.Optional[ KGSearchHit ] = self.wikidata_search(  # type: ignore
            link.kg_ent.label,  # type: ignore
            debug = debug,
        )

        if debug:
            ic(link.span, wd_ent)

        if wd_ent is None or wd_ent.prob <= self.min_similarity:
            return None

        wd_link: LinkedEntity = LinkedEntity(
            link.span,
            wd_ent.iri,
            len(link.span),  # type: ignore
            "wikidata",
            wd_ent.prob,
            link.token_id,
            wd_ent,
        )

        if debug:
            ic(wd_link)

        src_node: Node = graph.nodes.get(link.iri)  # type: ignore

        dst_node: Node = self._make_link(
            graph,
            pipe,
            wd_link,
            str(rdflib.RDF.type),
            debug = debug,
        )

        # add an equivalency edge between the two linked entities
        edge: Edge = graph.make_edge(  # type: ignore
            src_node,
            dst_node,
            RelEnum.IRI,
            str(rdflib.OWL.sameAs),
            wd_link.prob,
            debug = debug,
        )

        if edge is not None:
            pipe.edges.append(edge)

        # return the constructed edge
        return edge
class NERSpanMarker (Component):  # pylint: disable=R0903
    """
    Pipeline component which registers `SpanMarkerNER` with `spaCy`.
    """

    def __init__ (
        self,
        *,
        ner_model: str = NER_MODEL,
        ) -> None:
        """
        Constructor.

            ner_model:
        name of the model handed to `SpanMarker`
        """
        self.ner_model: str = ner_model


    def augment_pipe (
        self,
        factory: PipelineFactory,
        ) -> None:
        """
        Add a `span_marker` stage, configured with this model, to each
        of the three `spaCy` pipelines held by the factory.

            factory:
        the `PipelineFactory` used to configure this pipeline component
        """
        # same order as before: token, NER, then auxiliary pipeline;
        # each pipeline gets its own freshly-built config dict
        for pipe_name in ( "tok_pipe", "ner_pipe", "aux_pipe" ):
            nlp = getattr(factory, pipe_name)
            settings: dict = { "model": self.ner_model }
            nlp.add_pipe("span_marker", config = settings)
+ + factory: +a `PipelineFactory` used to configure components + """ + raise NotImplementedError + + +class KnowledgeGraph (Component): + """ +Base class for a _knowledge graph_ interface. + """ + NER_MAP: typing.Dict[ str, dict ] = OrderedDict({}) + NS_PREFIX: typing.Dict[ str, str ] = OrderedDict({}) + + + def augment_pipe ( + self, + factory: "PipelineFactory", + ) -> None: + """ +Encapsulate a `spaCy` call to `add_pipe()` configuration. + + factory: +a `PipelineFactory` used to configure components + """ + pass # pylint: disable=W0107 + + + def remap_ner ( + self, + label: typing.Optional[ str ], + ) -> typing.Optional[ str ]: + """ +Remap the OntoTypes4 values from NER output to more general-purpose IRIs. + + label: +input NER label, an `OntoTypes4` value + + returns: +an IRI for the named entity + """ + return label + + + def normalize_prefix ( + self, + iri: str, + *, + debug: bool = False, # pylint: disable=W0613 + ) -> str: + """ +Normalize the given IRI to use standard namespace prefixes. + + iri: +input IRI, in fully-qualified domain representation + + debug: +debugging flag + + returns: +the compact IRI representation, using an RDF namespace prefix + """ + return iri + + + def perform_entity_linking ( + self, + graph: SimpleGraph, + pipe: "Pipeline", + *, + debug: bool = False, + ) -> None: + """ +Perform _entity linking_ based on "spotlight" and other services. + + graph: +source graph + + pipe: +configured pipeline for the current document + + debug: +debugging flag + """ + pass # pylint: disable=W0107 + + + def resolve_rel_iri ( + self, + rel: str, + *, + lang: str = "en", # pylint: disable=W0613 + debug: bool = False, # pylint: disable=W0613 + ) -> typing.Optional[ str ]: + """ +Resolve a `rel` string from a _relation extraction_ model which has +been trained on this knowledge graph. 
+ + rel: +relation label, generation these source from Wikidata for many RE projects + + lang: +language identifier + + debug: +debugging flag + + returns: +a resolved IRI + """ + return rel + + +class InferRel (abc.ABC): # pylint: disable=R0903 + """ +Abstract base class for a _relation extraction_ model wrapper. + """ + + @abc.abstractmethod + def gen_triples ( + self, + pipe: "Pipeline", + *, + debug: bool = False, + ) -> typing.Iterator[typing.Tuple[ Node, str, Node ]]: + """ +Infer relations as triples through a generator _iteratively_. + + pipe: +configured pipeline for the current document + + debug: +debugging flag + + yields: +generated triples + """ + raise NotImplementedError + + + async def gen_triples_async ( + self, + pipe: "Pipeline", + queue: asyncio.Queue, + *, + debug: bool = False, + ) -> None: + """ +Infer relations as triples produced to a queue _concurrently_. + + pipe: +configured pipeline for the current document + + queue: +queue of inference tasks to be performed + + debug: +debugging flag + """ + for src, iri, dst in self.gen_triples(pipe, debug = debug): + await queue.put(( src, iri, dst, )) + + +class Pipeline: # pylint: disable=R0902,R0903 + """ +Manage parsing of a document, which is assumed to be paragraph-sized. + """ + + def __init__ ( # pylint: disable=R0913 + self, + text_input: str, + tok_pipe: spacy.Language, + ner_pipe: spacy.Language, + aux_pipe: spacy.Language, + kg: KnowledgeGraph, # pylint: disable=C0103 + infer_rels: typing.List[ InferRel ], + ) -> None: + """ +Constructor. 
+ + text_input: +raw text to be parsed + + tok_pipe: +the `spaCy.Language` pipeline used for tallying individual tokens + + ner_pipe: +the `spaCy.Language` pipeline used for tallying named entities + + aux_pipe: +the `spaCy.Language` pipeline used for auxiliary components (e.g., `DBPedia Spotlight`) + + kg: +knowledge graph used for entity linking + + infer_rels: +a list of components for inferring relations + """ + self.text: str = text_input + + # `tok_doc` provides a stream of individual tokens + self.tok_doc: spacy.tokens.Doc = tok_pipe(self.text) + + # `ner_doc` provides the merged-entity spans from NER + self.ner_doc: spacy.tokens.Doc = ner_pipe(self.text) + + # `aux_doc` e.g., re-indexing spans for Spotlight entity linking + # NB: this is optional, in case the Spotlight service is down + self.aux_doc: typing.Optional[ spacy.tokens.Doc ] = None + + try: + self.aux_doc = aux_pipe(self.text) + except Exception as ex: # pylint: disable=W0718 + ic(ex) + + self.kg: KnowledgeGraph = kg # pylint: disable=C0103 + self.infer_rels: typing.List[ InferRel ] = infer_rels + + # list of Node objects for each parsed token, in sequence + self.tokens: typing.List[ Node ] = [] + + # set of Edge objects generated by this Pipeline + self.edges: typing.List[ Edge ] = [] + + + @classmethod + def get_lemma_key ( + cls, + span: typing.Union[ spacy.tokens.span.Span, spacy.tokens.token.Token ], + *, + placeholder: bool = False, + ) -> str: + """ +Compose a unique, invariant lemma key for the given span. 
+ + span: +span of tokens within the lemma + + placeholder: +flag for whether to create a placeholder + + returns: +a composed lemma key + """ + if isinstance(span, spacy.tokens.token.Token): + terms: typing.List[ str ] = [ + span.lemma_.strip().lower(), + span.pos_, + ] + + if placeholder: + terms.insert(0, str(span.i)) + + else: + terms = functools.reduce( + operator.iconcat, + [ + [ token.lemma_.strip().lower(), token.pos_, ] + for token in span + ], + [], + ) + + return ".".join(terms) + + + def get_ent_lemma_keys ( + self, + ) -> typing.Iterator[ typing.Tuple[ str, int ]]: + """ +Iterate through the fully qualified lemma keys for an extracted entity. + + yields: +the lemma keys within an extracted entity + """ + for ent in self.tok_doc.ents: + yield self.get_lemma_key(ent), len(ent) + + + def link_noun_chunks ( + self, + nodes: dict, + *, + debug: bool = False, + ) -> typing.List[ NounChunk ]: + """ +Link any noun chunks which are not already subsumed by named entities. + + nodes: +dictionary of `Node` objects in the graph + + debug: +debugging flag + + returns: +a list of identified noun chunks which are novel + """ + chunks: typing.List[ NounChunk ] = [] + + # first pass: note the available noun chunks + for sent_id, sent in enumerate(self.tok_doc.sents): + for span in sent.noun_chunks: + lemma_key: str = self.get_lemma_key(span) + + chunks.append( + NounChunk( + span, + span.text, + len(span), + lemma_key, + lemma_key not in nodes, + sent_id, + ) + ) + + # second pass: remap span indices to the merged entities pipeline + for i, span in enumerate(self.ner_doc.noun_chunks): + if span.text == self.tokens[span.start].text: + chunks[i].unseen = False + elif chunks[i].unseen: + chunks[i].start = span.start + + if debug: + ic(chunks[i]) + + return chunks + + + ###################################################################### + ## relation extraction + + def iter_entity_pairs ( + self, + pipe_graph: nx.MultiGraph, + max_skip: int, + *, + debug: bool = True, + 
) -> typing.Iterator[ typing.Tuple[ Node, Node ]]: + """ +Iterator for entity pairs for which the algorithm infers relations. + + pipe_graph: +a `networkx.MultiGraph` representation of the graph, reused for graph algorithms + + max_skip: +maximum distance between entities for inferred relations + + debug: +debugging flag + + yields: +pairs of entities within a range, e.g., to use for relation extraction + """ + ent_list: typing.List[ Node ] = [ + node + for node in self.tokens + if node.kind in [ NodeEnum.ENT ] + ] + + for pair in itertools.product(ent_list, repeat = 2): + if pair[0] != pair[1]: + src: Node = pair[0] + dst: Node = pair[1] + + try: + path: typing.List[ int ] = nx.shortest_path( + pipe_graph, + source = src.node_id, + target = dst.node_id, + weight = "weight", + method = "dijkstra", + ) + + if debug: + ic(src.node_id, dst.node_id, path) + + if len(path) <= max_skip: + yield ( src, dst, ) + except nx.NetworkXNoPath: + pass + except Exception as ex: # pylint: disable=W0718 + ic(ex) + ic("ERROR", src, dst) + traceback.print_exc() + + +class PipelineFactory: # pylint: disable=R0903 + """ +Factory pattern for building a pipeline, which is one of the more +expensive operations with `spaCy` + """ + + def __init__ ( # pylint: disable=W0102 + self, + *, + spacy_model: str = SPACY_MODEL, + ner: typing.Optional[ Component ] = None, + kg: KnowledgeGraph = KnowledgeGraph(), # pylint: disable=C0103 + infer_rels: typing.List[ InferRel ] = [] + ) -> None: + """ +Constructor which instantiates the `spaCy` pipelines: + + * `tok_pipe` -- regular generator for parsed tokens + * `ner_pipe` -- with entities merged + * `aux_pipe` -- spotlight entity linking + +which will be needed for parsing and entity linking. 
+ + spacy_model: +the specific model to use in `spaCy` pipelines + + ner: +optional custom NER component + + kg: +knowledge graph used for entity linking + + infer_rels: +a list of components for inferring relations + """ + self.ner: typing.Optional[ Component ] = ner + self.kg: KnowledgeGraph = kg # pylint: disable=C0103 + self.infer_rels: typing.List[ InferRel ] = infer_rels + + # determine the NER model to be used + exclude: typing.List[ str ] = [] + + if self.ner is not None: + exclude.append("ner") + + # build the pipelines + # NB: `spaCy` team doesn't quite get the PEP 621 restrictions which PyPa mangled: + # https://github.com/explosion/spaCy/issues/3536 + # https://github.com/explosion/spaCy/issues/4592#issuecomment-704373657 + if not spacy.util.is_package(spacy_model): + spacy.cli.download(spacy_model) + + self.tok_pipe = spacy.load( + spacy_model, + exclude = exclude, + ) + + self.ner_pipe = spacy.load( + spacy_model, + exclude = exclude, + ) + + self.aux_pipe = spacy.load( + spacy_model, + exclude = exclude, + ) + + # add NER + if self.ner is not None: + self.ner.augment_pipe(self) + + # `aux_pipe` only: entity linking + self.kg.augment_pipe(self) + + # `ner_pipe` only: merge entities + self.ner_pipe.add_pipe( + "merge_entities", + ) + + + def create_pipeline ( + self, + text_input: str, + ) -> Pipeline: + """ +Instantiate the document pipelines needed to parse the input text. 
+ + text_input: +raw text to be parsed + + returns: +a configured `Pipeline` object + """ + pipe: Pipeline = Pipeline( + text_input, + self.tok_pipe, + self.ner_pipe, + self.aux_pipe, + self.kg, + self.infer_rels, + ) + + return pipe diff --git a/textgraphs/rel.py b/textgraphs/rel.py new file mode 100644 index 0000000000000000000000000000000000000000..e664c5b8571e93650e80938d1eca7d4722c5574e --- /dev/null +++ b/textgraphs/rel.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +These classes provide wrappers for _relation extraction_ models: + + * ThuNLP `OpenNRE` + * Babelscape `REBEL` + +see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md +""" + +import typing + +from icecream import ic # pylint: disable=E0401 +import networkx as nx # pylint: disable=E0401 +import opennre # pylint: disable=E0401 +import transformers # pylint: disable=E0401 + +from .defaults import MAX_SKIP, MREBEL_MODEL, OPENNRE_MIN_PROB, OPENNRE_MODEL +from .elem import Node +from .pipe import InferRel, Pipeline + + +###################################################################### +## class definitions + +class InferRel_OpenNRE (InferRel): # pylint: disable=C0103,R0903 + """ +Perform relation extraction based on the `OpenNRE` model. + + """ + def __init__ ( + self, + *, + model: str = OPENNRE_MODEL, + max_skip: int = MAX_SKIP, + min_prob: float = OPENNRE_MIN_PROB, + ) -> None: + """ +Constructor. 
+ + model: +the specific model to be used in `OpenNRE` + + max_skip: +maximum distance between entities for inferred relations + + min_prob: +minimum probability threshold for accepting an inferred relation + """ + self.max_skip: int = max_skip + self.min_prob: float = min_prob + + self.nre_pipeline: opennre.model.softmax_nn.SoftmaxNN = opennre.get_model(model) + + + def gen_triples ( + self, + pipe: Pipeline, + *, + debug: bool = False, + ) -> typing.Iterator[typing.Tuple[ Node, str, Node ]]: + """ +Iterate on entity pairs to drive `OpenNRE`, inferring relations +represented as triples which get produced by a generator. + + pipe: +configured pipeline for the current document + + debug: +debugging flag + + yields: +generated triples as candidates for inferred relations + """ + node_list: list = [ + node.node_id + for node in pipe.tokens + ] + + pipe_graph: nx.MultiGraph = nx.MultiGraph() + pipe_graph.add_nodes_from(node_list) + + pipe_graph.add_edges_from([ + ( edge.src_node, edge.dst_node, ) + for edge in pipe.edges + if edge is not None and edge.src_node in node_list and edge.dst_node in node_list + ]) + + for src, dst in pipe.iter_entity_pairs(pipe_graph, self.max_skip, debug = debug): + rel, prob = self.nre_pipeline.infer({ # type: ignore + "text": pipe.text, + "h": { "pos": src.get_pos() }, + "t": { "pos": dst.get_pos() }, + }) + + if prob >= self.min_prob: + if debug: + ic(src.text, dst.text) + ic(rel, prob) + + # use the knowledge graph to resolve the IRI + iri: typing.Optional[ str ] = pipe.kg.resolve_rel_iri( + rel, + ) + + if iri is None: + iri = "opennre:" + rel.replace(" ", "_") + + yield src, iri, dst + + +class InferRel_Rebel (InferRel): # pylint: disable=C0103,R0903 + """ +Perform relation extraction based on the `REBEL` model. + + + """ + + def __init__ ( + self, + *, + lang: str = "en_XX", + mrebel_model: str = MREBEL_MODEL, + ) -> None: + """ +Constructor. 
+ + lang: +language identifier + + mrebel_model: +tokenizer model to be used + """ + self.lang = lang + + self.hf_pipeline: transformers.pipeline = transformers.pipeline( + "translation_xx_to_yy", + model = mrebel_model, + tokenizer = mrebel_model, + ) + + + def tokenize_sent ( + self, + text: str, + ) -> str: + """ +Apply the tokenizer manually, since we need to extract special tokens. + + text: +input text for the sentence to be tokenized + + returns: +extracted tokens + """ + tokenized: list = self.hf_pipeline( + text, + decoder_start_token_id = 250058, + src_lang = self.lang, + tgt_lang = "", + return_tensors = True, + return_text = False, + ) + + extracted: list = self.hf_pipeline.tokenizer.batch_decode([ + tokenized[0]["translation_token_ids"] + ]) + + return extracted[0] + + + def extract_triplets_typed ( + self, + text: str, + ) -> list: + """ +Parse the generated text and extract its triplets. + + text: +input text for the sentence to use in inference + + returns: +a list of extracted triples + """ + triplets: list = [] + current: str = "x" + subject: str = "" + subject_type: str = "" + relation: str = "" + object_: str = "" + object_type: str = "" + + text = text.strip()\ + .replace("", "")\ + .replace("", "")\ + .replace("", "")\ + .replace("tp_XX", "")\ + .replace("__en__", "") + + for token in text.split(): + if token in [ "", "" ]: + current = "t" + + if relation != "": + triplets.append({ + "head": subject.strip(), + "head_type": subject_type, + "type": relation.strip(), + "tail": object_.strip(), + "tail_type": object_type, + }) + + relation = "" + + subject = "" + + elif token.startswith("<") and token.endswith(">"): + if current in [ "t", "o" ]: + current = "s" + + if relation != "": + triplets.append({ + "head": subject.strip(), + "head_type": subject_type, + "type": relation.strip(), + "tail": object_.strip(), + "tail_type": object_type, + }) + + object_ = "" + subject_type = token[1:-1] + else: + current = "o" + object_type = token[1:-1] + 
relation = "" + + else: + if current == "t": + subject += " " + token + elif current == "s": + object_ += " " + token + elif current == "o": + relation += " " + token + + if subject != "" and relation != "" and object_ != "" and object_type != "" and subject_type != "": # pylint: disable=C0301 + triplets.append({ + "head": subject.strip(), + "head_type": subject_type, + "tail": object_.strip(), + "tail_type": object_type, + "rel": relation.strip(), + }) + + return triplets + + + def gen_triples ( + self, + pipe: Pipeline, + *, + debug: bool = False, + ) -> typing.Iterator[typing.Tuple[ Node, str, Node ]]: + """ +Drive `REBEL` to infer relations for each sentence, represented as +triples which get produced by a generator. + + pipe: +configured pipeline for the current document + + debug: +debugging flag + + yields: +generated triples as candidates for inferred relations + """ + for sent in pipe.ner_doc.sents: + extract: str = self.tokenize_sent(str(sent).strip()) + triples: typing.List[ dict ] = self.extract_triplets_typed(extract) + + tok_map: dict = { + token.text: pipe.tokens[token.i] + for token in sent + } + + if debug: + ic(extract, triples) + + for triple in triples: + src: typing.Optional[ Node ] = tok_map.get(triple["head"]) + dst: typing.Optional[ Node ] = tok_map.get(triple["tail"]) + rel: str = triple["rel"] + + if src is not None and dst is not None: + if debug: + ic(src, dst, rel) + + # use the knowledge graph to resolve the IRI + iri: typing.Optional[ str ] = pipe.kg.resolve_rel_iri( + rel, + ) + + if iri is None: + iri = "mrebel:" + rel.replace(" ", "_") + + yield src, iri, dst + + +if __name__ == "__main__": + _rebel: InferRel_Rebel = InferRel_Rebel() + + _para: list = [ + "Werner Herzog is a remarkable filmmaker and intellectual from Germany, the son of Dietrich Herzog.", # pylint: disable=C0301 + "After the war, Werner fled to America to become famous.", + "Instead, Herzog became President and decided to nuke Slovenia.", + ] + + for _sent in 
_para: + _extract: str = _rebel.tokenize_sent(_sent.strip()) + ic(_extract) + + _triples: list = _rebel.extract_triplets_typed(_extract) + ic(_triples) diff --git a/textgraphs/util.py b/textgraphs/util.py new file mode 100644 index 0000000000000000000000000000000000000000..f7cc9c7f089214dc41a17580de1c0f07f86054c5 --- /dev/null +++ b/textgraphs/util.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Utility functions for the `TextGraphs` library. + +see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md +""" + +import enum +import math +import typing + +import numpy as np # type: ignore # pylint: disable=E0401 +import pandas as pd # type: ignore # pylint: disable=E0401 + + +###################################################################### +## class definitions + +class EnumBase (enum.IntEnum): + """ +A mixin for Enum codecs. + """ + + @property + def decoder ( + self + ) -> typing.List[ str ]: + """ +Property used for codec. + """ + return [ "xyzzy" ] + + + @classmethod + def decode ( + cls, + text: str, + ) -> enum.IntEnum: + """ +Codec for loading from a string. + + text: +string representation for the input value being decoded + """ + return cls[text.strip().upper()] + + + def __str__ ( + self + ) -> str: + """ +Codec for representing as a string. 
+ + returns: +decoded string representation of the enumerated value + """ + return self.decoder[self.value] + + +###################################################################### +## utility functions + +def calc_quantile_bins ( + num_rows: int + ) -> np.ndarray: + """ +Calculate the bins to use for a quantile stripe, +using [`numpy.linspace`](https://numpy.org/doc/stable/reference/generated/numpy.linspace.html) + + num_rows: +number of rows in the target dataframe + + returns: +calculated bins, as a `numpy.ndarray` + """ + granularity = max(round(math.log(num_rows) * 4), 1) + + return np.linspace( + 0, + 1, + num = granularity, + endpoint = True, + ) + + +def stripe_column ( + values: list, + bins: int, + ) -> np.ndarray: + """ +Stripe a column in a dataframe, by interpolating quantiles into a set of discrete indexes. + + values: +list of values to stripe + + bins: +quantile bins; see [`calc_quantile_bins()`](#calc_quantile_bins-function) + + returns: +the striped column values, as a `numpy.ndarray` + """ + s = pd.Series(values) # pylint: disable=C0103 + q = s.quantile(bins, interpolation = "nearest") # pylint: disable=C0103 + + try: + stripe = np.digitize(values, q) - 1 + return stripe + except ValueError as ex: + # should never happen? + print("ValueError:", str(ex), values, s, q, bins) + raise + + +def root_mean_square ( + values: typing.List[ float ] + ) -> float: + """ +Calculate the [*root mean square*](https://mathworld.wolfram.com/Root-Mean-Square.html) +of the values in the given list. 
+ + values: +list of values to use in the RMS calculation + + returns: +RMS metric as a float + """ + s: float = sum(map(lambda x: float(x)**2.0, values)) # pylint: disable=C0103 + n: float = float(len(values)) # pylint: disable=C0103 + + return math.sqrt(s / n) diff --git a/textgraphs/version.py b/textgraphs/version.py new file mode 100644 index 0000000000000000000000000000000000000000..867fa7093d1fcfa1eed6951f57e07207eec3927f --- /dev/null +++ b/textgraphs/version.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Describe the GitHub repo version tags and commit hash for +the `TextGraphs` library. + +see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md +""" + +from os.path import dirname, abspath +import pathlib +import typing + +from git import Repo # pylint: disable=E0401 # type: ignore + + +## use the local Git info for version info, if available +REPO_HASH: str = "xxxxxxxxx" # default/placeholder +REPO_TAGS: str = "refs/tags/v1.0.0" # default/placeholder + +try: + repo_path: pathlib.Path = pathlib.Path(dirname(abspath(__file__))) + repo: Repo = Repo(repo_path.parents[0]) + + REPO_HASH = str(repo.head.commit) + REPO_TAGS = repo.tags +except Exception as ex: # pylint: disable=W0703 + print(ex) + + +# cast version string into a float +try: + v_seq: typing.List[ str ] = str(REPO_TAGS[-1]).replace("v", "").split(".")[:3] + + __version__ = ".".join(v_seq) # this is the OpenAPI documentation version + + __version_major__ = int(v_seq[0]) + __version_minor__ = int(v_seq[1]) + __version_patch__ = int(v_seq[2]) +except IndexError: + # the code above may fail in Github Actions workflow + __version__ = "0.0+test" + + __version_major__ = 0 + __version_minor__ = 0 + __version_patch__ = 0 + + +def get_repo_version ( + ) -> typing.Tuple[ str, str ]: + """ +Access the Git repository information and return items to identify +the version/commit running in production. 
+ + returns: +version tag and commit hash + """ + return __version__, REPO_HASH diff --git a/textgraphs/vis.py b/textgraphs/vis.py new file mode 100644 index 0000000000000000000000000000000000000000..5b80c6df1186e9fd6cac7595cdfbc421d108027a --- /dev/null +++ b/textgraphs/vis.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=R0801 + +""" +Visualization methods based on `PyVis`, `wordcloud`, and so on. + +This class handles visualizations of graphs and graph elements. + +see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md +""" + +from dataclasses import dataclass +import typing + +from icecream import ic # pylint: disable=E0401 +import matplotlib.colors as mcolors # pylint: disable=E0401 +import networkx as nx # pylint: disable=E0401 +import pyvis # pylint: disable=E0401 +import wordcloud # pylint: disable=E0401 + +from .elem import NodeEnum, RelEnum +from .graph import SimpleGraph +from .pipe import KnowledgeGraph + + +###################################################################### +## class definitions + +@dataclass(order=False, frozen=True) +class NodeStyle: # pylint: disable=R0902 + """ +Dataclass used for styling PyVis nodes. 
+ """ + label: NodeEnum + shape: str + color: str + +NODE_STYLES: typing.List[ NodeStyle ] = [ + NodeStyle( + label = NodeEnum.DEP, + shape = "star", + color = "hsla(72, 19%, 90%, 0.4)", + ), + NodeStyle( + label = NodeEnum.LEM, + shape = "square", + color = "hsl(306, 45%, 57%)", + ), + NodeStyle( + label = NodeEnum.ENT, + shape = "circle", + color = "hsl(65, 46%, 58%)", + ), + NodeStyle( + label = NodeEnum.CHU, + shape = "triangle", + color = "hsla(72, 19%, 90%, 0.9)", + ), + NodeStyle( + label = NodeEnum.IRI, + shape = "diamond", + color = "hsla(55, 17%, 49%, 0.5)", + ), +] + +# shapes: image, circularImage, diamond, dot, star, triangle, triangleDown, square, icon + + +class RenderPyVis: # pylint: disable=R0903 + """ +Render the _lemma graph_ as a `PyVis` network. + """ + HTML_HEIGHT_WITH_CONTROLS: int = 1200 + + def __init__ ( + self, + graph: SimpleGraph, + kg: KnowledgeGraph, # pylint: disable=C0103 + ) -> None: + """ +Constructor. + + graph: +source graph to be visualized + + kg: +knowledge graph used for entity linking + """ + self.graph: SimpleGraph = graph + self.kg: KnowledgeGraph = kg # pylint: disable=C0103 + + + def render_lemma_graph ( # pylint: disable=R0912 + self, + *, + debug: bool = True, + ) -> pyvis.network.Network: + """ +Prepare the structure of the `NetworkX` graph to use for building +and returning a `PyVis` network to render. + +Make sure to call beforehand: `TextGraphs.calc_phrase_ranks()` + + debug: +debugging flag + + returns: + typing.Dict[ int, int ]: + """ +Cluster the communities in the _lemma graph_, then draw a +`NetworkX` graph of the notes with a specific color for each +community. 
+ +Make sure to call beforehand: `TextGraphs.calc_phrase_ranks()` + + spring_distance: +`NetworkX` parameter used to separate clusters visually + + debug: +debugging flag + + returns: +a map of the calculated communities + """ + # cluster the communities, using girvan-newman + comm_iter: typing.Generator = nx.community.girvan_newman( + self.graph.lemma_graph, + ) + + _ = next(comm_iter) + next_level = next(comm_iter) + communities: list = sorted(map(sorted, next_level)) + + if debug: + ic(communities) + + comm_map: typing.Dict[ int, int ] = { + node_id: i + for i, comm in enumerate(communities) + for node_id in comm + } + + # map from community => color + xkcd_colors: typing.List[ str ] = list(mcolors.XKCD_COLORS.values()) + + colors: typing.List[ str ] = [ + xkcd_colors[comm_map[n]] + for n in list(self.graph.lemma_graph.nodes()) + ] + + # prep the labels + labels: typing.Dict[ int, str ] = { + node.node_id: self.kg.normalize_prefix(node.get_name()) + for node in self.graph.nodes.values() + } + + # ¡dibuja, hombre! + nx.draw_networkx( + self.graph.lemma_graph, + pos = nx.spring_layout( + self.graph.lemma_graph, + k = spring_distance / len(communities), + ), + labels = labels, + node_color = colors, + edge_color = "#bbb", + with_labels = True, + font_size = 9, + ) + + return comm_map + + + def generate_wordcloud ( + self, + *, + background: str = "black", + ) -> wordcloud.WordCloud: + """ +Generate a tag cloud from the given phrases. + +Make sure to call beforehand: `TextGraphs.calc_phrase_ranks()` + + background: +background color for the rendering + + returns: +the rendering as a `wordcloud.WordCloud` object, which can be used to generate PNG images, etc. 
+ """ + terms: dict = {} + max_weight: float = 0.0 + + for node in self.graph.nodes.values(): + if node.weight > 0.0: + phrase: str = node.text.replace(" ", "_") + max_weight = max(max_weight, node.weight) + terms[phrase] = node.weight + + freq: dict = { + phrase: round(weight / max_weight * 1000.0) + for phrase, weight in terms.items() + } + + cloud: wordcloud.WordCloud = wordcloud.WordCloud( + background_color = background, + ) + + return cloud.generate_from_frequencies(freq)