Paco Nathan committed
Commit 91eaff6 · 0 Parent(s)

A new start

This view is limited to 50 files because the commit contains too many changes.
.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.graffle filter=lfs diff=lfs merge=lfs -text
+ docs/assets/textgraphs.graffle filter=lfs diff=lfs merge=lfs -text
.github/FUNDING.yml ADDED
@@ -0,0 +1 @@
+ github: ceteri
.github/dependabot.yml ADDED
@@ -0,0 +1,9 @@
+ # Please see the documentation for all configuration options:
+ # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+ version: 2
+ updates:
+   - package-ecosystem: "pip"
+     directory: "/"
+     schedule:
+       interval: "daily"
.github/workflows/ci.yml ADDED
@@ -0,0 +1,38 @@
+ name: CI
+
+ on: [pull_request, workflow_dispatch]
+
+ jobs:
+   # pre-commit:
+   #   name: Run pre-commit
+   #   runs-on: ubuntu-latest
+   #   steps:
+   #     - uses: actions/checkout@v3
+   #     - uses: actions/setup-python@v3
+   #     - uses: pre-commit/action@v3.0.0
+
+   test:
+     name: Tests for Python ${{ matrix.python-version }}
+     runs-on: ubuntu-latest
+     strategy:
+       matrix:
+         python-version: ['3.10']
+       fail-fast: false
+     # needs: pre-commit
+
+     steps:
+       - uses: actions/checkout@v3
+
+       - name: Set up Python
+         uses: actions/setup-python@v3
+         with:
+           python-version: ${{ matrix.python-version }}
+
+       - name: Install dependencies
+         run: |
+           pip install -e .
+           pip install -r requirements-dev.txt
+
+       - name: Run tests
+         run: |
+           pytest
.gitignore ADDED
@@ -0,0 +1,173 @@
+ # local files
+ *~
+ chromedriver
+ lemma.json
+ lemma.ttl
+ lemma.zip
+ lemma_graph.zip
+ examples/tmp.*.html
+ vis.html
+ gor.html
+ txg.tgz
+ s2v_old/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,37 @@
+ # See https://pre-commit.com for more information
+ # See https://pre-commit.com/hooks.html for more hooks
+ default_stages: [commit, push]
+ default_language_version:
+   python: python3
+ exclude: "deprecated"
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.4.0
+     hooks:
+       - id: trailing-whitespace
+         exclude: ^docs/
+       - id: check-builtin-literals
+       - id: check-executables-have-shebangs
+       - id: check-merge-conflict
+       - id: check-json
+       - id: check-yaml
+       - id: debug-statements
+       - id: detect-private-key
+   - repo: https://github.com/pre-commit/mirrors-mypy
+     rev: v1.4.1
+     hooks:
+       - id: mypy  # type annotations
+         exclude: ^tests/,^venv/
+   - repo: https://github.com/PyCQA/pylint
+     rev: v2.17.4
+     hooks:
+       - id: pylint
+         exclude: error.py
+   - repo: https://github.com/codespell-project/codespell
+     rev: v2.2.4
+     hooks:
+       - id: codespell  # spell-check source code
+         args: ["-L", "basf,textgraph,udo"]  # comma separated stop words
+         exclude: ^README.md|^NOTES.md|^examples|^docs/ack.md|^docs/biblio.md
+         language: python
+         types: [text]
CITATION ADDED
@@ -0,0 +1,8 @@
+ @software{TextGraphs,
+   author = {Paco Nathan},
+   title = {{TextGraphs + LLMs + graph ML for entity extraction, linking, ranking, and constructing a lemma graph}},
+   year = 2023,
+   publisher = {Derwen},
+   doi = {10.5281/zenodo.10431783},
+   url = {https://github.com/DerwenAI/textgraphs}
+ }
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023-2024 Derwen, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
MANIFEST.in ADDED
@@ -0,0 +1,10 @@
+ include LICENSE
+ include README.md
+ include pyproject.toml
+ include requirements.txt
+ include setup.py
+ include tests/*.py
+ include textgraphs/*.py
+ prune .ipynb_checkpoints
+ prune docs
+ prune venv
NOTES.md ADDED
@@ -0,0 +1,39 @@
+ TODO:
+
+ * can we build a causal graph of the provenance?
+     - https://www.pywhy.org/dowhy/v0.11.1/
+
+ * target publications:
+     - https://drops.dagstuhl.de/entities/issue/TGDK-volume-1-issue-1
+
+ * impl a _semantic random walk_ from a source KG
+
+ * link entities for lemmas, noun chunks using MediaWiki lookups?
+     - apply default semantics: `skos:related`
+
+ * eval clustering/community detection for GOR?
+     - https://github.com/MengLiuPurdue/LocalGraphClustering
+
+ * RAG example
+     - https://docs.llamaindex.ai/en/latest/examples/index_structs/knowledge_graph/KuzuGraphDemo.html#query-with-embeddings
+
+ * extend GOR to replicate NodePiece/ULTRA?
+
+ * reify GOR, then use FastRP to generate embeddings?
+     - https://github.com/Knorreman/fastRP
+
+ * eval community detection to condense nodes using k-medoids?
+     - https://medium.com/neo4j/clustering-graph-data-with-k-medoids-3b6a67ea0873
+
+ * add conda packaging
+     - https://conda.github.io/grayskull/
+
+ * SPARQL the DBPedia/Wikidata equivs
+
+ * other NER/RE:
+     - https://github.com/dwadden/dygiepp?tab=readme-ov-file#pretrained-models
+
+ * check out https://github.com/wikipedia2vec/wikipedia2vec
+
+ * link `sense2vec` synonyms; make affordances for UI to annotate synonyms
PROMPT.md ADDED
@@ -0,0 +1,15 @@
+ https://medium.com/@nizami_muhammad/extracting-relation-from-sentence-using-llm-597d0c0310a8
+
+ Sentence: Werner Herzog is the son of Dietrich Herzog
+ Extract RDF predicate from the sentence in this format:
+ subject:<subject>
+ predicate:<predicate>
+ object:<object, optional>
+
+ ---
+
+ Sentence: Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog. After the war, Werner fled to America to become famous. Instead he became President and decided to nuke Slovenia.
+ Be brief, extract the top RDF predicate in DBPedia for the relation between <http://dbpedia.org/resource/Werner_Herzog><http://dbpedia.org/resource/Germany> in this format:
+ subject:<subject>
+ predicate:<predicate>
+ object:<object, optional>
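These notes capture prompts tried against a chat LLM, following the Medium article linked above. As a rough, hypothetical harness for sending the first prompt programmatically (the notes do not record which model or client was used, so both the OpenAI-compatible client and the model name below are illustrative assumptions):

```python
# hypothetical harness: PROMPT.md does not record which LLM or client
# was used, so the client and the model name here are assumptions
from openai import OpenAI

PROMPT: str = """Sentence: Werner Herzog is the son of Dietrich Herzog
Extract RDF predicate from the sentence in this format:
subject:<subject>
predicate:<predicate>
object:<object, optional>"""

client = OpenAI()  # reads OPENAI_API_KEY from the environment

response = client.chat.completions.create(
    model = "gpt-4",  # illustrative model choice only
    messages = [{"role": "user", "content": PROMPT}],
)

print(response.choices[0].message.content)
```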
README.md ADDED
@@ -0,0 +1,118 @@
+ ---
+ title: TextGraphs
+ emoji: ✴
+ colorFrom: green
+ colorTo: gray
+ sdk: streamlit
+ sdk_version: 1.28.2
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+
+ # TextGraphs
+
+ [![DOI](https://zenodo.org/badge/735568863.svg)](https://zenodo.org/doi/10.5281/zenodo.10431783)
+ ![Licence](https://img.shields.io/github/license/DerwenAI/textgraphs)
+ [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
+ ![CI](https://github.com/DerwenAI/textgraphs/workflows/CI/badge.svg)
+ <br/>
+ ![Repo size](https://img.shields.io/github/repo-size/DerwenAI/textgraphs)
+ ![downloads](https://img.shields.io/pypi/dm/textgraphs)
+ ![sponsor](https://img.shields.io/github/sponsors/ceteri)
+
+ <img
+   alt="TextGraphs logo"
+   src="https://raw.githubusercontent.com/DerwenAI/textgraphs/main/docs/assets/logo.png"
+   width="231"
+ />
+
+
+ ## project info
+
+ Project home: <https://huggingface.co/spaces/DerwenAI/textgraphs>
+
+ Full documentation: <https://derwen.ai/docs/txg/>
+
+ Sample code is provided in `demo.py`.
+
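Distilled from `demo.py` in this commit, a minimal usage sketch of the library API looks roughly like the following. Note that calling `TextGraphs()` without an explicit `PipelineFactory` is an assumption here: `app.py` remarks that factory defaults are generally fine, while `demo.py` always passes an explicit factory.

```python
import textgraphs

# assumption: the no-argument constructor applies the factory defaults,
# which app.py notes are generally fine; demo.py passes an explicit
# PipelineFactory instead -- see that file for the full configuration
tg = textgraphs.TextGraphs()

pipe = tg.create_pipeline(
    "Werner Herzog is a filmmaker from Germany.",
)

# collect graph elements from the parse, build the lemma graph,
# then rank the extracted phrases
tg.collect_graph_elements(pipe)
tg.construct_lemma_graph()
tg.calc_phrase_ranks(pr_alpha = textgraphs.PAGERANK_ALPHA)

print(tg.get_phrases_as_df())
```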
+
+ ## requirements
+
+ * Python 3.10+
+
+
+ ## install library from PyPI
+
+ Prepare the virtual environment:
+
+ ```bash
+ python3 -m venv venv
+ source venv/bin/activate
+ python3 -m pip install -U pip wheel setuptools
+ ```
+
+ Install from [PyPI](https://pypi.python.org/pypi/textgraphs):
+
+ ```bash
+ python3 -m pip install -U textgraphs
+ ```
+
+
+ ## run demos locally
+
+ ```bash
+ python3 demo.py
+ ```
+
+ ```bash
+ streamlit run app.py
+ ```
+
+
+ ## install library from source locally
+
+ ```bash
+ python3 -m venv venv
+ source venv/bin/activate
+
+ python3 -m pip install -U pip wheel setuptools
+ python3 -m pip install -e .
+ ```
+
+ To run the Streamlit or JupyterLab demos, also install:
+
+ ```bash
+ python3 -m pip install -r requirements-dev.txt
+ ```
+
+
+ ## license and copyright
+
+ Source code for **TextGraphs** plus its logo, documentation, and
+ examples have an [MIT license](https://spdx.org/licenses/MIT.html)
+ which is succinct and simplifies use in commercial applications.
+
+ All materials herein are Copyright &copy; 2023-2024 Derwen, Inc.
+
+
+ ## attribution
+
+ Please use the following BibTeX entry for citing **TextGraphs** if you
+ use it in your research or software:
+
+ ```bibtex
+ @software{TextGraphs,
+   author = {Paco Nathan},
+   title = {{TextGraphs + LLMs + graph ML for entity extraction, linking, ranking, and constructing a lemma graph}},
+   year = 2023,
+   publisher = {Derwen},
+   doi = {10.5281/zenodo.10431783},
+   url = {https://github.com/DerwenAI/textgraphs}
+ }
+ ```
+
+
+ ## star history
+
+ [![Star History Chart](https://api.star-history.com/svg?repos=derwenai/textgraphs&type=Date)](https://star-history.com/#derwenai/textgraphs&Date)
SECURITY.md ADDED
@@ -0,0 +1,14 @@
+ # Security Policy
+
+ ## Supported Versions
+
+ Versions which are currently being supported with security updates:
+
+ | Version | Supported          |
+ | ------- | ------------------ |
+ | > 0.2   | :white_check_mark: |
+
+ ## Reporting a Vulnerability
+
+ To report a vulnerability, please create a new [*issue*](https://github.com/DerwenAI/textgraphs/issues).
+ We will be notified immediately, and will attempt to respond promptly on the reported issue.
app.py ADDED
@@ -0,0 +1,459 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # pylint: disable=C0301
+
+ """
+ HuggingFace Spaces demo of the `TextGraphs` library using Streamlit
+
+ see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md
+ """
+
+ import pathlib
+ import time
+ import typing
+
+ import matplotlib.pyplot as plt  # pylint: disable=E0401
+ import pandas as pd  # pylint: disable=E0401
+ import pyvis  # pylint: disable=E0401
+ import spacy  # pylint: disable=E0401
+ import streamlit as st  # pylint: disable=E0401
+
+ import textgraphs
+
+
+ if __name__ == "__main__":
+     # default text input
+     SRC_TEXT: str = """
+ Werner Herzog is a remarkable filmmaker and intellectual originally from Germany, the son of Dietrich Herzog.
+     """
+
+     # store the initial value of widgets in session state
+     if "visibility" not in st.session_state:
+         st.session_state.visibility = "visible"
+         st.session_state.disabled = False
+
+     with st.container():
+         st.title("demo: TextGraphs + LLMs to construct a 'lemma graph'")
+         st.markdown(
+             """
+ docs: <https://derwen.ai/docs/txg/>
+ &nbsp; &nbsp;
+ DOI: 10.5281/zenodo.10431783
+             """,
+             unsafe_allow_html = True,
+         )
+
+
+         # collect input + config
+         st.subheader("configure", divider = "rainbow")
+
+         text_input: str = st.text_area(
+             "Source Text:",
+             value = SRC_TEXT.strip(),
+         )
+
+         llm_ner = st.checkbox(
+             "enhance spaCy NER using: SpanMarker",
+             value = False,
+         )
+
+         link_ents = st.checkbox(
+             "link entities using: DBPedia Spotlight, WikiMedia API",
+             value = False,
+         )
+
+         infer_rel = st.checkbox(
+             "infer relations using: REBEL, OpenNRE, qwikidata",
+             value = False,
+         )
+
+         if text_input or llm_ner or link_ents or infer_rel:
+             ## parse the document
+             st.subheader("parse the raw text", divider = "rainbow")
+             start_time: float = time.time()
+
+             # generally it is fine to use factory defaults,
+             # although let's illustrate these settings here
+             infer_rels: list = []
+
+             if infer_rel:
+                 with st.spinner(text = "load rel models..."):
+                     infer_rels = [
+                         textgraphs.InferRel_OpenNRE(
+                             model = textgraphs.OPENNRE_MODEL,
+                             max_skip = textgraphs.MAX_SKIP,
+                             min_prob = textgraphs.OPENNRE_MIN_PROB,
+                         ),
+                         textgraphs.InferRel_Rebel(
+                             lang = "en_XX",
+                             mrebel_model = textgraphs.MREBEL_MODEL,
+                         ),
+                     ]
+
+             ner: typing.Optional[ textgraphs.Component ] = None
+
+             if llm_ner:
+                 ner = textgraphs.NERSpanMarker(
+                     ner_model = textgraphs.NER_MODEL,
+                 )
+
+             tg: textgraphs.TextGraphs = textgraphs.TextGraphs(
+                 factory = textgraphs.PipelineFactory(
+                     spacy_model = textgraphs.SPACY_MODEL,
+                     ner = ner,
+                     kg = textgraphs.KGWikiMedia(
+                         spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API,
+                         dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API,
+                         dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API,
+                         wikidata_api = textgraphs.WIKIDATA_API,
+                         min_alias = textgraphs.DBPEDIA_MIN_ALIAS,
+                         min_similarity = textgraphs.DBPEDIA_MIN_SIM,
+                     ),
+                     infer_rels = infer_rels,
+                 ),
+             )
+
+             duration: float = round(time.time() - start_time, 3)
+             st.write(f"set up: {round(duration, 3)} sec")
+
+             with st.spinner(text = "parse text..."):
+                 start_time = time.time()
+
+                 pipe: textgraphs.Pipeline = tg.create_pipeline(
+                     text_input.strip(),
+                 )
+
+             duration = round(time.time() - start_time, 3)
+             st.write(f"parse text: {round(duration, 3)} sec, {len(text_input)} characters")
+
+             # render the entity html
+             ent_html: str = spacy.displacy.render(
+                 pipe.ner_doc,
+                 style = "ent",
+                 jupyter = False,
+             )
+
+             st.markdown(
+                 ent_html,
+                 unsafe_allow_html = True,
+             )
+
+             # generate dependencies as an SVG
+             dep_svg = spacy.displacy.render(
+                 pipe.ner_doc,
+                 style = "dep",
+                 jupyter = False,
+             )
+
+             st.image(
+                 dep_svg,
+                 width = 800,
+                 use_column_width = "never",
+             )
+
+
+             ## collect graph elements from the parse
+             st.subheader("construct the base level of the lemma graph", divider = "rainbow")
+             start_time = time.time()
+
+             tg.collect_graph_elements(
+                 pipe,
+                 debug = False,
+             )
+
+             duration = round(time.time() - start_time, 3)
+             st.write(f"collect elements: {round(duration, 3)} sec, {len(tg.nodes)} nodes, {len(tg.edges)} edges")
+
+             ## perform entity linking
+             if link_ents:
+                 st.subheader("extract entities and perform entity linking", divider = "rainbow")
+
+                 with st.spinner(text = "entity linking..."):
+                     start_time = time.time()
+
+                     tg.perform_entity_linking(
+                         pipe,
+                         debug = False,
+                     )
+
+                 duration = round(time.time() - start_time, 3)
+                 st.write(f"entity linking: {round(duration, 3)} sec")
+
+
+             ## perform relation extraction
+             if infer_rel:
+                 st.subheader("infer relations", divider = "rainbow")
+                 st.write("NB: this part runs an order of magnitude more *slooooooowly* on HF Spaces")
+
+                 with st.spinner(text = "relation extraction..."):
+                     start_time = time.time()
+
+                     # NB: run this iteratively since Streamlit on HF Spaces is *sloooooooooow*
+                     inferred_edges: list = tg.infer_relations(
+                         pipe,
+                         debug = False,
+                     )
+
+                 duration = round(time.time() - start_time, 3)
+
+                 n_list: list = list(tg.nodes.values())
+
+                 df_rel: pd.DataFrame = pd.DataFrame.from_dict([
+                     {
+                         "src": n_list[edge.src_node].text,
+                         "dst": n_list[edge.dst_node].text,
+                         "rel": edge.rel,
+                         "weight": edge.prob,
+                     }
+                     for edge in inferred_edges
+                 ])
+
+                 st.dataframe(df_rel)
+                 st.write(f"relation extraction: {round(duration, 3)} sec, {len(df_rel)} edges")
+
+
+             ## construct the _lemma graph_
+             start_time = time.time()
+
+             tg.construct_lemma_graph(
+                 debug = False,
+             )
+
+             duration = round(time.time() - start_time, 3)
+             st.write(f"construct graph: {round(duration, 3)} sec")
+
+
+             ## rank the extracted phrases
+             st.subheader("rank the extracted phrases", divider = "rainbow")
+             start_time = time.time()
+
+             tg.calc_phrase_ranks(
+                 pr_alpha = textgraphs.PAGERANK_ALPHA,
+                 debug = False,
+             )
+
+             df_ent: pd.DataFrame = tg.get_phrases_as_df()
+
+             duration = round(time.time() - start_time, 3)
+             st.write(f"extract: {round(duration, 3)} sec, {len(df_ent)} entities")
+
+             st.dataframe(df_ent)
+
+
+             ## generate a word cloud
+             st.subheader("generate a word cloud", divider = "rainbow")
+
+             render: textgraphs.RenderPyVis = tg.create_render()
+             wordcloud = render.generate_wordcloud()
+
+             st.image(
+                 wordcloud.to_image(),
+                 width = 700,
+                 use_column_width = "never",
+             )
+
+
+             ## visualize the lemma graph
+             st.subheader("visualize the lemma graph", divider = "rainbow")
+             st.markdown(
+                 """
+ what you get at this stage is a relatively noisy,
+ low-level detailed graph of the parsed text
+
+ the most interesting nodes will probably be either
+ subjects (`nsubj`) or direct objects (`pobj`)
+                 """
+             )
+
+             pv_graph: pyvis.network.Network = render.render_lemma_graph(
+                 debug = False,
+             )
+
+             pv_graph.force_atlas_2based(
+                 gravity = -38,
+                 central_gravity = 0.01,
+                 spring_length = 231,
+                 spring_strength = 0.7,
+                 damping = 0.8,
+                 overlap = 0,
+             )
+
+             pv_graph.show_buttons(filter_ = [ "physics" ])
+             pv_graph.toggle_physics(True)
+
+             py_html: pathlib.Path = pathlib.Path("vis.html")
+             pv_graph.save_graph(py_html.as_posix())
+
+             st.components.v1.html(
+                 py_html.read_text(encoding = "utf-8"),
+                 height = render.HTML_HEIGHT_WITH_CONTROLS,
+                 scrolling = False,
+             )
+
+
+             ## cluster the communities
+             st.subheader("cluster the communities", divider = "rainbow")
+             st.markdown(
+                 """
+ <details>
+ <summary><strong>About this clustering...</strong></summary>
+ <p>
+ In the tutorial
+ <a href="https://towardsdatascience.com/how-to-convert-any-text-into-a-graph-of-concepts-110844f22a1a" target="_blank">"How to Convert Any Text Into a Graph of Concepts"</a>,
+ Rahul Nayak uses the
+ <a href="https://en.wikipedia.org/wiki/Girvan%E2%80%93Newman_algorithm"><em>girvan-newman</em></a>
+ algorithm to split the graph into communities, then clusters on those communities.
+ His approach works well for unsupervised clustering of key phrases which have been extracted from a collection of many documents.
+ </p>
+ <p>
+ While Nayak was working with entities extracted from "chunks" of text, not with a text graph per se, this approach is useful for identifying network motifs which can be condensed, e.g., to extract a semantic graph overlay as an <em>abstraction layer</em> atop a lemma graph.
+ </p>
+ </details>
+ <br/>
+                 """,
+                 unsafe_allow_html = True,
+             )
+
+             spring_dist_val = st.slider(
+                 "spring distance for NetworkX clusters",
+                 min_value = 0.0,
+                 max_value = 10.0,
+                 value = 1.2,
+             )
+
+             if spring_dist_val:
+                 start_time = time.time()
+                 fig, ax = plt.subplots()
+
+                 comm_map: dict = render.draw_communities(
+                     spring_distance = spring_dist_val,
+                 )
+
+                 st.pyplot(fig)
+
+                 duration = round(time.time() - start_time, 3)
+                 st.write(f"cluster: {round(duration, 3)} sec, {max(comm_map.values()) + 1} clusters")
+
+
+             ## transform a graph of relations
+             st.subheader("transform as a graph of relations", divider = "rainbow")
+             st.markdown(
+                 """
+ Using the topological transform given in `lee2023ingram`, construct a
+ _graph of relations_ for enhancing graph inference.
+
+ <details>
+ <summary><strong>What does this transform provide?</strong></summary>
+ <p>
+ By using a <em>graph of relations</em> dual representation of our graph data, first and foremost we obtain a more compact representation of the relations in the graph, and means of making inferences (e.g., <em>link prediction</em>) where there is substantially more invariance in the training data.
+ </p>
+ <p>
+ Also recognize that for a parse graph of a paragraph in the English language, the most interesting nodes will probably be either subjects (<code>nsubj</code>) or direct objects (<code>pobj</code>). Here in the <em>graph of relations</em> we can see illustrated how the important details from <em>entity linking</em> tend to cluster near either <code>nsubj</code> or <code>pobj</code> entities, connected through punctuation. This aspect is not as readily observed in the earlier visualization of the <em>lemma graph</em>.
+ </p>
+ </details>
+                 """,
+                 unsafe_allow_html = True,
+             )
+
+             start_time = time.time()
+
+             gor: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(tg)
+             gor.seeds()
+             gor.construct_gor()
+
+             scores: typing.Dict[ tuple, float ] = gor.get_affinity_scores()
+             pv_graph = gor.render_gor_pyvis(scores)
+
+             pv_graph.force_atlas_2based(
+                 gravity = -38,
+                 central_gravity = 0.01,
+                 spring_length = 231,
+                 spring_strength = 0.7,
+                 damping = 0.8,
+                 overlap = 0,
+             )
+
+             pv_graph.show_buttons(filter_ = [ "physics" ])
+             pv_graph.toggle_physics(True)
+
+             py_html = pathlib.Path("gor.html")
+             pv_graph.save_graph(py_html.as_posix())
+
+             st.components.v1.html(
+                 py_html.read_text(encoding = "utf-8"),
+                 height = render.HTML_HEIGHT_WITH_CONTROLS,
+                 scrolling = False,
+             )
+
+             duration = round(time.time() - start_time, 3)
+             st.write(f"transform: {round(duration, 3)} sec, {len(gor.rel_list)} relations")
+
+             ## download lemma graph
+             st.subheader("download the results", divider = "rainbow")
+             st.markdown(
+                 """
+ Download a serialized <em>lemma graph</em> in multiple formats:
+                 """,
+                 unsafe_allow_html = True,
+             )
+
+             col1, col2, col3 = st.columns(3)
+
+             with col1:
+                 st.download_button(
+                     label = "download node-link",
+                     data = tg.dump_lemma_graph(),
+                     file_name = "lemma_graph.json",
+                     mime = "application/json",
+                 )
+
+                 st.markdown(
+                     """
+ <a href="https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.json_graph.node_link_data.html" target="_blank"><em>node-link</em></a>: JSON data suitable for import to <a href="https://neo4j.com/docs/getting-started/data-import/csv-import/" target="_blank"><em>Neo4j</em></a>, <a href="https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.json_graph.node_link_graph.html#networkx.readwrite.json_graph.node_link_graph" target="_blank"><em>NetworkX</em></a>, etc.
+                     """,
+                     unsafe_allow_html = True,
+                 )
+
+             with col2:
+                 st.download_button(
+                     label = "download RDF",
+                     data = tg.export_rdf(),
+                     file_name = "lemma_graph.ttl",
+                     mime = "text/turtle",
+                 )
+
+                 st.markdown(
+                     """
+ <a href="https://www.w3.org/TR/turtle/" target="_blank"><em>Turtle/N3</em></a>: W3C semantic graph representation, based on RDF, OWL, SKOS, etc.
+                     """,
+                     unsafe_allow_html = True,
+                 )
+
+             with col3:
+                 st.download_button(
+                     label = "download KùzuDB",
+                     data = tg.export_kuzu(zip_name = "lemma_graph.zip"),
+                     file_name = "lemma.zip",
+                     mime = "application/x-zip-compressed",
+                 )
+
+                 st.markdown(
+                     """
+ <a href="https://opencypher.org/" target="_blank"><em>openCypher</em></a>: ZIP file of a labeled property graph in <a href="https://kuzudb.com/" target="_blank"><em>KùzuDB</em></a>
+                     """,
+                     unsafe_allow_html = True,
+                 )
+
+
+             ## WIP
+             st.divider()
+             st.write("(WIP)")
+
+             thanks: str = """
+ This demo has completed, and thank you for running a Derwen space!
+             """
+
+             st.toast(
+                 thanks,
+                 icon = "😍",
+             )
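The clustering note embedded in `app.py` above points to the Girvan-Newman algorithm from the Nayak tutorial. As a minimal, self-contained NetworkX sketch of that community split, independent of the TextGraphs API (the karate-club graph here is a stand-in, not project data):

```python
import networkx as nx
from networkx.algorithms.community import girvan_newman

# toy stand-in for a lemma graph; any NetworkX graph works here
G = nx.karate_club_graph()

# Girvan-Newman repeatedly removes the edge with highest betweenness;
# the iterator yields successively finer community partitions
communities = next(girvan_newman(G))
print([sorted(c) for c in communities])
```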
bin/nb_md.sh ADDED
@@ -0,0 +1,15 @@
+ #!/bin/bash -e -x
+
+ for notebook_path in examples/*.ipynb; do
+     [ -e "$notebook_path" ] || continue
+
+     notebook=`basename $notebook_path`
+     stem=`basename $notebook_path .ipynb`
+
+     cp $notebook_path docs/$notebook
+     jupyter nbconvert docs/$notebook --to markdown
+     #exit 0
+
+     python3 bin/vis_doc.py docs/"$stem".md
+     rm docs/$notebook
+ done
bin/preview.py ADDED
@@ -0,0 +1,50 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ """
+ Preview the `MkDocs` build of the online documentation.
+ """
+
+ from pathlib import PurePosixPath
+ import os
+
+ from flask import Flask, redirect, send_from_directory, url_for  # pylint: disable=E0401
+
+ DOCS_ROUTE = "/docs/"
+ DOCS_FILES = "../site"
+ DOCS_PORT = 8000
+
+ APP = Flask(__name__, static_folder=DOCS_FILES, template_folder=DOCS_FILES)
+
+ APP.config["DEBUG"] = False
+ APP.config["MAX_CONTENT_LENGTH"] = 52428800
+ APP.config["SECRET_KEY"] = "Technically, I remain uncommitted."
+ APP.config["SEND_FILE_MAX_AGE_DEFAULT"] = 3000
+
+
+ @APP.route(DOCS_ROUTE, methods=["GET"])
+ @APP.route(DOCS_ROUTE + "<path:path>", methods=["GET"], defaults={"path": None})
+ @APP.route(DOCS_ROUTE + "<path:path>", methods=["GET"])
+ def static_proxy (path=""):
+     """static route for an asset"""
+     if not path:
+         suffix = ""
+     else:
+         suffix = PurePosixPath(path).suffix
+
+     if suffix not in [".css", ".js", ".map", ".png", ".svg", ".xml"]:
+         path = os.path.join(path, "index.html")
+
+     return send_from_directory(DOCS_FILES, path)
+
+
+ @APP.route("/index.html")
+ @APP.route("/home/")
+ @APP.route("/")
+ def home_redirects ():
+     """redirect for home page"""
+     return redirect(url_for("static_proxy"))
+
+
+ if __name__ == "__main__":
+     APP.run(host="0.0.0.0", port=DOCS_PORT, debug=True)
bin/push_pypi.sh ADDED
@@ -0,0 +1,10 @@
+ #!/bin/bash -e -x
+
+ rm -rf dist build textgraphs.egg-info
+ python3 -m build
+ twine check dist/*
+
+ # this assumes the use of `~/.pypirc`
+ # https://packaging.python.org/en/latest/specifications/pypirc/
+
+ twine upload ./dist/* --verbose
bin/vis_doc.py ADDED
@@ -0,0 +1,196 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ """
+ Convert the markdown generated from Jupyter notebooks to preserve
+ rendered images, etc.
+ """
+
+ import os
+ import pathlib
+ import re
+ import sys
+ import time
+ import traceback
+ import typing
+
+ from icecream import ic  # pylint: disable=E0401
+ from selenium import webdriver  # pylint: disable=E0401
+
+
+ class Converter:
+     """
+     HTML/Markdown conversion
+     """
+     PAT_HEADER = re.compile(r"^(```python\n\# for use.*production:\n.*\n```\n)", re.MULTILINE)
+     PAT_SOURCE = re.compile(r"\s+src\=\"(\S+)\"")
+     REPLACEMENT_HEADER: str = """
+ !!! note
+     To run this notebook in JupyterLab, load [`examples/{}.ipynb`]({}/examples/{}.ipynb)
+
+ """
+
+     def __init__ (
+         self,
+         src_url: str,
+     ) -> None:
+         """
+         Constructor.
+         """
+         self.src_url: str = src_url
+
+
+     def replace_sys_header (
+         self,
+         text: str,
+         stem: str,
+         *,
+         debug: bool = False,
+     ) -> str:
+         """
+         Replace the initial cell in a tutorial notebook.
+         """
+         output: typing.List[ str ] = []
+
+         for chunk in self.PAT_HEADER.split(text):
+             m_header: typing.Optional[ re.Match ] = self.PAT_HEADER.match(chunk)
+
+             if debug:
+                 ic(m_header)
+
+             if m_header:
+                 header: str = self.REPLACEMENT_HEADER.format(stem, self.src_url, stem)
+                 output.append(header)
+             else:
+                 output.append(chunk)
+
+         return "\n".join(output)
+
+
+     def get_pyvis_html (
+         self,
+         iframe: str,
+         *,
+         debug: bool = False,
+     ) -> str:
+         """
+         Locate the HTML files generated by `PyVis` if any.
+         This assumes the HTML files are named `tmp.fig*.*`
+         """
+         source_html: typing.Optional[ str ] = None
+         m_source: typing.Optional[ re.Match ] = self.PAT_SOURCE.search(iframe)
+
+         if m_source:
+             source_html = m_source.group(1)
+
+             if debug:
+                 ic(source_html)
+
+             if "tmp.fig" not in source_html:  # type: ignore
+                 # <iframe/> wasn't generated by PyVis
+                 source_html = None
+
+         return source_html  # type: ignore
+
+
+     def render_screenshot (
+         self,
+         source_html: str,
+         source_png,
+     ) -> None:
+         """
+         use Selenium to render `source_png` from `source_html`
+         """
+         #chrome_path = os.getcwd() + "/chromedriver"
+         #chrome_options = Options()
+
+         browser: webdriver.Chrome = webdriver.Chrome()
+         browser.get(source_html)
+         time.sleep(10)
+
+         browser.get_screenshot_as_file(source_png)
+         browser.quit()
+
+
+     def replace_pyvis_iframe (
+         self,
+         text: str,
+         parent: pathlib.Path,
+         stem: str,
+         *,
+         debug: bool = False,
+     ) -> str:
+         """
+         Substitute static images for the rendered graphs.
+         """
+         output: typing.List[ str ] = []
+         in_iframe: bool = False
+
+         for line in text.split("\n"):
+             if line.startswith("<iframe"):
+                 in_iframe = True
+
+             if not in_iframe:
+                 output.append(line)
+             elif line.strip().startswith("src="):
+                 src_html: str = self.get_pyvis_html(line)
+                 src_png: str = src_html.replace(".html", ".png")
+
+                 if debug:
+                     ic(src_png)
+
+                 try:
+                     os.mkdir(f"{parent}/{stem}_files")
+                 except:  # pylint: disable=W0702
+                     pass
+
+                 self.render_screenshot(
+                     f"file://{os.getcwd()}/examples/{src_html}",
+                     f"{parent}/{stem}_files/{src_png}",
+                 )
+
+                 output.append(f"![png]({stem}_files/{src_png})")
+
+             if line.startswith("></iframe>"):
+                 in_iframe = False
+
+         return "\n".join(output)
+
+
+ if __name__ == "__main__":
+     try:
+         conv: Converter = Converter(
+             "https://github.com/DerwenAI/textgraphs/blob/main",
+         )
+
+         filename: pathlib.Path = pathlib.Path(sys.argv[1])
+         _parent: pathlib.Path = filename.parent
+         _stem: str = filename.stem
+
+         ic(filename, _parent, _stem)
+
+         with open(filename, "r", encoding = "utf-8") as fp:
+             html: str = fp.read()
+
+         html = conv.replace_sys_header(  # pylint: disable=C0103
+             html,
+             _stem,
+             debug = False,  # True
+         )
+
+         #print(text)
+         #sys.exit(0)
+
+         html = conv.replace_pyvis_iframe(  # pylint: disable=C0103
+             html,
+             _parent,
+             _stem,
+             debug = True,  # False
+         )
+
+         with open(filename, "w", encoding = "utf-8") as fp:
+             fp.write(html)
+
+     except Exception as ex:  # pylint: disable=W0718
+         ic(ex)
+         traceback.print_exc()
demo.py ADDED
@@ -0,0 +1,220 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ """
+ Sample application to demo the `TextGraphs` library.
+
+ see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md
+ """
+
+ import asyncio
+ import sys  # pylint: disable=W0611
+ import traceback
+ import time
+ import typing
+
+ from icecream import ic  # pylint: disable=E0401
+ from pyinstrument import Profiler  # pylint: disable=E0401
+ import matplotlib.pyplot as plt  # pylint: disable=E0401
+ import pandas as pd  # pylint: disable=E0401
+
+ import textgraphs
+
+
+ if __name__ == "__main__":
+     SRC_TEXT: str = """
+ Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
+ After the war, Werner fled to America to become famous.
+     """
+
+     ## set up
+     ## NB: profiler raises handler exceptions when `concur = False`
+     debug: bool = False  # True
+     concur: bool = True  # False
+     profile: bool = True  # False
+
+     if profile:
+         profiler: Profiler = Profiler()
+         profiler.start()
+
+     try:
+         start_time: float = time.time()
+
+         tg: textgraphs.TextGraphs = textgraphs.TextGraphs(
+             factory = textgraphs.PipelineFactory(
+                 spacy_model = textgraphs.SPACY_MODEL,
+                 ner = None,  #textgraphs.NERSpanMarker(),
+                 kg = textgraphs.KGWikiMedia(
+                     spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API,
+                     dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API,
+                     dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API,
+                     wikidata_api = textgraphs.WIKIDATA_API,
+                 ),
+                 infer_rels = [
+                     textgraphs.InferRel_OpenNRE(
+                         model = textgraphs.OPENNRE_MODEL,
+                         max_skip = textgraphs.MAX_SKIP,
+                         min_prob = textgraphs.OPENNRE_MIN_PROB,
+                     ),
+                     textgraphs.InferRel_Rebel(
+                         lang = "en_XX",
+                         mrebel_model = textgraphs.MREBEL_MODEL,
+                     ),
+                 ],
+             ),
+         )
+
+         duration: float = round(time.time() - start_time, 3)
+         print(f"{duration:7.3f} sec: set up")
+
+
+         ## NLP parse
+         start_time = time.time()
+
+         pipe: textgraphs.Pipeline = tg.create_pipeline(
+             SRC_TEXT.strip(),
+         )
+
+         duration = round(time.time() - start_time, 3)
+         print(f"{duration:7.3f} sec: parse text")
+
+
+         ## collect graph elements from the parse
+         start_time = time.time()
+
+         tg.collect_graph_elements(
+             pipe,
+             debug = debug,
+         )
+
+         duration = round(time.time() - start_time, 3)
+         print(f"{duration:7.3f} sec: collect elements")
+
+
+         ## perform entity linking
+         start_time = time.time()
+
+         tg.perform_entity_linking(
+             pipe,
+             debug = debug,
+         )
+
+         duration = round(time.time() - start_time, 3)
+         print(f"{duration:7.3f} sec: entity linking")
+
+
+         ## perform concurrent relation extraction
+         start_time = time.time()
+
+         if concur:
+             try:
+                 loop = asyncio.get_running_loop()
+             except RuntimeError:
+                 loop = asyncio.new_event_loop()
+                 asyncio.set_event_loop(loop)
+
+             inferred_edges: list = loop.run_until_complete(
+                 tg.infer_relations_async(
+                     pipe,
+                     debug = debug,
+                 )
+             )
+         else:
+             inferred_edges = tg.infer_relations(
+                 pipe,
+                 debug = debug,
+             )
+
+         duration = round(time.time() - start_time, 3)
+         print(f"{duration:7.3f} sec: relation extraction")
+
+         n_list: list = list(tg.nodes.values())
+
+         df_rel: pd.DataFrame = pd.DataFrame.from_dict([
+             {
+                 "src": n_list[edge.src_node].text,
+                 "dst": n_list[edge.dst_node].text,
+                 "rel": pipe.kg.normalize_prefix(edge.rel),
+                 "weight": edge.prob,
+             }
+             for edge in inferred_edges
+         ])
+
+         ic(df_rel)
+
+
+         ## construct the _lemma graph_
+         start_time = time.time()
+
+         tg.construct_lemma_graph(
+             debug = debug,
+         )
+
+         duration = round(time.time() - start_time, 3)
+         print(f"{duration:7.3f} sec: construct graph")
+
+
+         ## rank the extracted phrases
+         start_time = time.time()
+
+         tg.calc_phrase_ranks(
+             pr_alpha = textgraphs.PAGERANK_ALPHA,
+             debug = debug,
+         )
+
+         duration = round(time.time() - start_time, 3)
+         print(f"{duration:7.3f} sec: rank phrases")
+
+
+         ## show the extracted phrase results
+         ic(tg.get_phrases_as_df())
+
+         if debug:  # pylint: disable=W0101
+             for key, node in tg.nodes.items():
+                 print(key, node)
+
+             for key, edge in tg.edges.items():
+                 print(key, edge)
+
+     except Exception as ex:  # pylint: disable=W0718
+         ic(ex)
+         traceback.print_exc()
+
+
+     ## transform graph data to a _graph of relations_
+     start_time = time.time()
+
+     gor: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(
+         tg,
+     )
+
+     gor.seeds(
+         debug = False,  # True
+     )
+
+     gor.construct_gor(
+         debug = False,  # True
+     )
+
+     _scores: typing.Dict[ tuple, float ] = gor.get_affinity_scores(
+         debug = False,  # True
+     )
+
+     duration = round(time.time() - start_time, 3)
+     print(f"{duration:7.3f} sec: graph of relations")
+
+     gor.render_gor_plt(_scores)
+     plt.show()
+
+     #sys.exit(0)
+
+
+     ######################################################################
+     ## stack profiler report
+     if profile:
+         profiler.stop()
+         profiler.print()
+
+     ## output lemma graph as JSON
+     with open("lemma.json", "w", encoding = "utf-8") as fp:
+         fp.write(tg.dump_lemma_graph())
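`demo.py` finishes by serializing the lemma graph to `lemma.json`. Assuming `dump_lemma_graph()` emits NetworkX node-link JSON, as the download notes in `app.py` suggest, the file can be reloaded for further analysis:

```python
import json

import networkx as nx

# reload the lemma graph that demo.py serialized as node-link JSON
with open("lemma.json", "r", encoding = "utf-8") as fp:
    data = json.load(fp)

graph = nx.node_link_graph(data)
print(graph.number_of_nodes(), "nodes,", graph.number_of_edges(), "edges")
```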
docs/abstract.md ADDED
@@ -0,0 +1,47 @@
+ # Introduction
+
+ **DRAFT** (WIP)
+
+ The primary goal of this project is to improve semi-automated KG construction from large collections of unstructured text sources, while leveraging feedback from domain experts and maintaining quality checks for the aggregated results.
+
+ Typical downstream use cases for these KGs include collecting data for industrial optimization use cases based on _operations research_, as mechanisms enabling structured LLM reasoning [#besta2024topo](biblio.md#besta2024topo), and potentially new methods of integrating KG linked data directly into LLM inference [#wen2023mindmap](biblio.md#wen2023mindmap).
+
+ To this point, this project explores hybrid applications which leverage LLMs to improve _natural language processing_ (NLP) pipeline components, complemented by other deep learning models, graph queries, semantic inference, and related APIs.
+
+ Notably, LLMs come from NLP research.
+ Amidst an overwhelming avalanche of contemporary news headlines, pre-print papers, celebrity researchers, industry pundits, and so on ...
+ the hype raises a simple question: how good are LLMs at improving the results of natural language parsing and annotation in practice?
+
+ Granted, it is possible to use LLM chat interfaces to generate entire KGs from unstructured text sources.
+ Results from this brute-force approach tend to be mixed, especially when KGs rely on non-trivial controlled vocabularies and overlapping concepts.
+ For examples, see [#lawrence2024ttg](biblio.md#lawrence2024ttg) and [#nizami2023llm](biblio.md#nizami2023llm).
+
+ Issues with LLM accuracy (e.g., hallucinations) may be partially addressed through use of _retrieval augmented generation_ (RAG).
+ Even so, this approach tends to be expensive, especially when a large number of PDF documents need to be used as input.
+ Use of a fully-automated "black box" based on an LLM chat agent in production use cases also tends to contradict the benefits of curating a KG to collect representations of an organization's domain expertise.
+
+ There are perhaps some deeper issues implied in this work.
+ To leverage "generative AI" for KGs, we must cross multiple boundaries of representation.
+ For example, graph ML approaches which start from graph-theoretic descriptions are losing vital information.
+ On the one hand, these are generally focused on _node prediction_ or _edge prediction_ tasks, which seems overly reductionist and simplistic in the context of trying to generate streams of _composable elements_ for building graphs.
+ On the other hand, these approaches typically get trained on _node embeddings_, _edge embeddings_, or _graph embeddings_ -- which may not quite fit the problem at hand.
+ Rolling back even further, the transition from NLP parsing of unstructured text sources to the construction of KGs also tends to throw away a lot of potentially useful annotations and context available from the NLP workflows.
+ Commonly accepted means for training LLMs from text sources directly often use tokenization which is relatively naïve about what might be structured within the data, other than linear sequences of characters.
+ Notably, this ignores the relationships among surface forms of text and their co-occurrence with predicted entities or relations.
+ Some contemporary approaches to RAG use "chunked" text, attempting to link between chunks, even though this approach arguably destroys information about what is structured within that input data.
+ These are multiple disconnects among the source data, the representation methods used in training models, and the tactics employed for applications; moreover, quite arguably the "applications" targeted in research projects generally stop at comparisons of benchmarks.
+ Overall, these disconnects indicate the need for rethinking the problem at multiple points.
+
+ For industry uses of KGs, one frequent observation from those leading production projects is that the "last mile" of applications generally relies on _operations research_, not ML.
+ We must keep these needs in mind when applying "generative AI" approaches to industry use cases.
+ Are we developing representations which can subsequently be leveraged for dynamic programming, convex optimization, etc.?
+
+ This project explores a different definition for "generative AI" in the context of working with KGs for production use cases.
+ Rather than pursue an LLM to perform all required tasks, is it possible to combine the use of smaller, more specialized models for specific tasks within the reasonably well-understood process of KG construction?
+ In broad strokes, can this alternative approach provide counterfactuals to the contemporary trends for chat-based _prompt engineering_?
+
+ Seeking to integrate results from several other research projects implies substantial amounts of code reuse.
+ It would be intractable in terms of time and funding to rewrite code and then re-evaluate models for the many research projects which are within the scope of this work.
+ Therefore reproducibility of published results -- based on open source code, models, evals, etc. -- becomes a crucial factor for determining whether other projects are suitable to be adapted into KG workflows.
+
+ For the sake of brevity, we do not define all of the terminology used, instead relying on broadly used terms in the literature.
docs/ack.md ADDED
@@ -0,0 +1,11 @@
+ # Acknowledgements
+
+ <img src="../assets/nouns/community.png" alt="Community by Aneeque Ahmed from the Noun Project" />
+
+ Contributors:
+
+ - Jürgen Müller, Zahid Abul-Basher, Nihatha Lathiff, et al., @ BASF
+ - open source sponsors for Derwen.ai
+ - perspectives from the KùzuDB.com team
+ - perspectives from the Argilla.io team
+ - feedback and suggestions from participants at [Dagstuhl Seminar 24061](https://www.dagstuhl.de/24061)
docs/assets/favicon.png ADDED
docs/assets/hitl.png ADDED
docs/assets/logo.png ADDED
docs/assets/nouns/api.png ADDED
docs/assets/nouns/biblio.png ADDED
docs/assets/nouns/community.png ADDED
docs/assets/nouns/concepts.png ADDED
docs/assets/nouns/discovery.png ADDED
docs/assets/nouns/evidence.png ADDED
docs/assets/nouns/feedback.png ADDED
docs/assets/nouns/howto.png ADDED
docs/assets/nouns/tutorial.png ADDED
docs/assets/textgraphs.graffle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2177f30434db8dc6534ed39b3f5a9bed3b0fbd00db26afd841f6e77c788910f2
+ size 1410392
docs/biblio.md ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bibliography
2
+
3
+ <img src="../assets/nouns/biblio.png" alt="books by b a r z i n from the Noun Project" />
4
+
5
+ Where possible, the bibliography entries use conventions at
6
+ <https://www.bibsonomy.org/>
7
+ for [*citation keys*](https://bibdesk.sourceforge.io/manual/BibDeskHelp_2.html).
8
+ Journal abbreviations come from
9
+ <https://academic-accelerator.com/Journal-Abbreviation/System>
10
+ based on [*ISO 4*](https://en.wikipedia.org/wiki/ISO_4) standards.
11
+ Links to online versions of cited works use
12
+ [DOI](https://www.doi.org/)
13
+ for [*persistent identifiers*](https://www.crossref.org/education/metadata/persistent-identifiers/).
14
+ When available,
15
+ [*open access*](https://peerj.com/preprints/3119v1/)
16
+ URLs are listed.
17
+
18
+
19
+ ## – A –
20
+
21
+ ### aarsen2023ner
22
+
23
+ ["SpanMarker for Named Entity Recognition"](https://raw.githubusercontent.com/tomaarsen/SpanMarkerNER/main/thesis.pdf)
24
+ **Tom Aarsen**
25
+ *Radboud University* (2023-06-01)
26
+ > A span-level Named Entity Recognition (NER) model that aims to improve performance while reducing computational requirements. SpanMarker leverages special marker tokens and utilizes BERT-style encoders with position IDs and attention mask matrices to capture contextual information effectively.
27
+
28
+ ### auer07dbpedia
29
+
30
+ ["DBpedia: A Nucleus for a Web of Open Data"](https://doi.org/10.1007/978-3-540-76298-0_52)
31
+ **Sören Auer**, **Christian Bizer**, **Georgi Kobilarov**, **Jens Lehmann**, **Richard Cyganiak**, **Zachary Ives**
32
+ *ISWC* (2007-11-11)
33
+ > DBpedia is a community effort to extract structured information from Wikipedia and to make this information available on the Web. DBpedia allows you to ask sophisticated queries against datasets derived from Wikipedia and to link other datasets on the Web to Wikipedia data.
34
+
35
+ ## – B –
36
+
37
+ ### bachbhg17
38
+
39
+ ["Hinge-Loss Markov Random Fields and Probabilistic Soft Logic"](https://arxiv.org/abs/1505.04406)
40
+ **Stephen Bach**, **Matthias Broecheler**, **Bert Huang**, **Lise Getoor**
41
+ *JMLR* (2017–11–17)
42
+ > We introduce two new formalisms for modeling structured data, and show that they can both capture rich structure and scale to big data. The first, hinge-loss Markov random fields (HL-MRFs), is a new kind of probabilistic graphical model that generalizes different approaches to convex inference.
43
+
44
+ ### barrière2016elsf
45
+
46
+ ["Entities, Labels, and Surface Forms"](https://doi.org/10.1007/978-3-319-41337-2_2)
47
+ **Caroline Barrière**
48
+ _Springer_ (2016-11-19)
49
+ > We will look into a first obstacle toward this seemingly simple IE goal: the fact that entities do not have normalized names. Instead, entities can be referred to by many different surface forms.
50
+
51
+ ### besta2024topo
52
+
53
+ ["Topologies of Reasoning: Demystifying Chains, Trees, and Graphs of Thoughts"](https://arxiv.org/abs/2401.14295)
54
+ **Maciej Besta**, **Florim Memedi**, **Zhenyu Zhang**, **Robert Gerstenberger**, **Nils Blach**, **Piotr Nyczyk**, **Marcin Copik**, **Grzegorz Kwasniewski**, **Jurgen Müller**, **Lukas Gianinazzi**, **Ales Kubicek**, **Hubert Niewiadomski**, **Onur Mutlu**, **Torsten Hoefler**
55
+ _ETH Zurich_ (2024-01-25)
56
+ > Introducing a blueprint and an accompanying taxonomy of prompting schemes, focusing on the underlying structure of reasoning.
57
+
58
+ ## – C –
59
+
60
+ ### cabot2023redfm
61
+
62
+ ["RED<sup>FM</sup>: a Filtered and Multilingual Relation Extraction Dataset"](https://arxiv.org/abs/2306.09802)
63
+ **Pere-Lluís Huguet Cabot**, **Simone Tedeschi**, **Axel-Cyrille Ngonga Ngomo**, **Roberto Navigli**
64
+ _ACL_ (2023-06-19)
65
+ > Relation Extraction (RE) is a task that identifies relationships between entities in a text, enabling the acquisition of relational facts and bridging the gap between natural language and structured knowledge. However, current RE models often rely on small datasets with low coverage of relation types, particularly when working with languages other than English. In this paper, we address the above issue and provide two new resources that enable the training and evaluation of multilingual RE systems.
66
+
67
+ ## – E –
68
+
69
+ ### erxlebengkmv14
70
+
71
+ ["Introducing Wikidata to the Linked Data Web"](https://doi.org/10.1007/978-3-319-11964-9_4)
72
+ **Fredo Erxleben**, **Michael Günther**, **Markus Krötzsch**, **Julian Mendez**, **Denny Vrandečić**
73
+ _ISWC_ (2014-10-19)
74
+ > We introduce new RDF exports that connect Wikidata to the Linked Data Web. We explain the data model of Wikidata and discuss its encoding in RDF. Moreover, we introduce several partial exports that provide more selective or simplified views on the data.
75
+
76
+ ## – F –
77
+
78
+ ### feng2023kuzu
79
+
80
+ ["KÙZU Graph Database Management System"](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf)
81
+ **Xiyang Feng**, **Guodong Jin**, **Ziyi Chen**, **Chang Liu**, **Semih Salihoğlu**
82
+ _CIDR_ (2023-01-08)
83
+ > We present Kùzu, a new GDBMS we are developing at University of Waterloo that aims to integrate state-of-art storage, indexing, and query processing techniques to highly optimize for this feature set.
84
+
85
+ ## – G –
86
+
87
+ ### galkin2023ultra
88
+
89
+ ["Towards Foundation Models for Knowledge Graph Reasoning"](https://arxiv.org/abs/2310.04562)
90
+ **Mikhail Galkin**, **Xinyu Yuan**, **Hesham Mostafa**, **Jian Tang**, **Zhaocheng Zhu**
91
+ preprint (2023-10-06)
92
+ > ULTRA builds relational representations as a function conditioned on their interactions. Such a conditioning strategy allows a pre-trained ULTRA model to inductively generalize to any unseen KG with any relation vocabulary and to be fine-tuned on any graph.
93
+
94
+ ## – H –
95
+
96
+ ### hagberg2008
97
+
98
+ ["Exploring network structure, dynamics, and function using NetworkX"](https://conference.scipy.org/proceedings/SciPy2008/paper_2/)
99
+ **Aric A. Hagberg**, **Daniel A. Schult**, **Pieter J. Swart**
100
+ _SciPy2008_ (2008-08-19)
101
+ > NetworkX is a Python language package for exploration and analysis of networks and network algorithms. The core package provides data structures for representing many types of networks, or graphs, including simple graphs, directed graphs, and graphs with parallel edges and self loops.
102
+
103
+ ### hahnr88
104
+
105
+ ["Automatic generation of hypertext knowledge bases"](https://doi.org/10.1145/966861.45429)
106
+ **Udo Hahn**, **Ulrich Reimer**
107
+ _ACM SIGOIS_ 9:2 (1988-04-01)
108
+ > The condensation process transforms the text representation structures resulting from the text parse into a more abstract thematic description of what the text is about, filtering out irrelevant knowledge structures and preserving only the most salient concepts.
109
+
110
+ ### hamilton2020grl
111
+
112
+ [_Graph Representation Learning_](https://www.cs.mcgill.ca/~wlh/grl_book/)
113
+ **William Hamilton**
114
+ Morgan and Claypool (pre-print 2020)
115
+ > A brief but comprehensive introduction to graph representation learning, including methods for embedding graph data, graph neural networks, and deep generative models of graphs.
116
+
117
+ ### hangyyls19
118
+
119
+ ["OpenNRE: An Open and Extensible Toolkit for Neural Relation Extraction"](https://doi.org/10.18653/v1/D19-3029)
120
+ **Xu Han**, **Tianyu Gao**, **Yuan Yao**, **Deming Ye**, **Zhiyuan Liu**, **Maosong Sun**
121
+ *EMNLP* (2019-11-03)
122
+ > OpenNRE is an open-source and extensible toolkit that provides a unified framework to implement neural models for relation extraction (RE).
123
+
124
+ ### hartig14
125
+
126
+ ["Reconciliation of RDF* and Property Graphs"](https://arxiv.org/abs/1409.3288)
127
+ **Olaf Hartig**
128
+ _CoRR_ (2014-11-14)
129
+ > The document proposes a formalization of the PG model and introduces well-defined transformations between PGs and RDF.
130
+
131
+ ### honnibal2020spacy
132
+
133
+ ["spaCy: Industrial-strength Natural Language Processing in Python"](https://doi.org/10.5281/zenodo.1212303)
134
+ **Matthew Honnibal**, **Ines Montani**, **Sofie Van Landeghem**, **Adriane Boyd**
135
+ *Explosion AI* (2016-10-18)
136
+ > spaCy is a library for advanced Natural Language Processing in Python and Cython. It's built on the very latest research, and was designed from day one to be used in real products.
137
+
138
+ ## – L –
139
+
140
+ ### lee2023ingram
141
+
142
+ ["InGram: Inductive Knowledge Graph Embedding via Relation Graphs"](https://arxiv.org/abs/2305.19987)
143
+ **Jaejun Lee**, **Chanyoung Chung**, **Joyce Jiyoung Whang**
144
+ _ICML_ (2023-08-17)
145
+ > In this paper, we propose an INductive knowledge GRAph eMbedding method, InGram, that can generate embeddings of new relations as well as new entities at inference time.
146
+
147
+ ### loganlpgs19
148
+
149
+ ["Barack's Wife Hillary: Using Knowledge-Graphs for Fact-Aware Language Modeling"](https://arxiv.org/abs/1906.07241)
150
+ **Robert L. Logan IV**, **Nelson F. Liu**, **Matthew E. Peters**, **Matt Gardner**, **Sameer Singh**
151
+ _ACL_ (2019-06-20)
152
+ > We introduce the knowledge graph language model (KGLM), a neural language model with mechanisms for selecting and copying facts from a knowledge graph that are relevant to the context.
153
+
154
+ ## – M –
155
+
156
+ ### martonsv17
157
+
158
+ ["Formalising openCypher Graph Queries in Relational Algebra"](https://doi.org/10.1007/978-3-319-66917-5_13)
159
+ **József Marton**, **Gábor Szárnyas**, **Dániel Varró**
160
+ _ADBIS_ (2017-08-25)
161
+ > We present a formal specification for openCypher, a high-level declarative graph query language with an ongoing standardisation effort.
162
+
163
+ ### mihalcea04textrank
164
+
165
+ ["TextRank: Bringing Order into Text"](https://www.aclweb.org/anthology/W04-3252/)
166
+ **Rada Mihalcea**, **Paul Tarau**
167
+ *EMNLP* pp. 404-411 (2004-07-25)
168
+ > In this paper, the authors introduce TextRank, a graph-based ranking model for text processing, and show how this model can be successfully used in natural language applications.
169
+
170
+ ## – N –
171
+
172
+ ### nathan2016ptr
173
+
174
+ ["PyTextRank, a Python implementation of TextRank for phrase extraction and summarization of text documents"](https://doi.org/10.5281/zenodo.4637885)
175
+ **Paco Nathan**, et al.
176
+ *Derwen* (2016-10-03)
177
+ > Python implementation of TextRank algorithms ("textgraphs") for phrase extraction
178
+
179
+ ### nathan2023glod
180
+
181
+ ["Graph Levels of Detail"](https://blog.derwen.ai/graph-levels-of-detail-ea4226abba55)
182
+ **Paco Nathan**
183
+ *Derwen* (2023-11-12)
184
+ > How can we work with graph data in more abstracted, aggregate perspectives? While we can run queries on graph data to compute aggregate measures, we don’t have programmatic means of “zooming out” to consider a large graph the way that one zooms out when using an online map.
185
+
186
+ ## – Q –
187
+
188
+ ### qin2023sgr
189
+
190
+ ["Semantic Random Walk for Graph Representation Learning in Attributed Graphs"](https://arxiv.org/abs/2305.06531)
191
+ **Meng Qin**
192
+ *Hong Kong University of Science and Technology* (2023-05-11)
193
+ > We introduced a novel SGR method to generally formulate the network embedding in attributed graphs as a high-order proximity based embedding task of an auxiliary weighted graph with heterogeneous entities.
194
+
195
+ ### qin2024irwe
196
+
197
+ ["IRWE: Inductive Random Walk for Joint Inference of Identity and Position Network Embedding"](https://arxiv.org/abs/2401.00651)
198
+ **Meng Qin**, **Dit-Yan Yeung**
199
+ *Hong Kong University of Science and Technology* (2024-01-01)
200
+ > Since nodes in a community should be densely connected, nodes within the same community are more likely to be reached via RWs compared with those in different communities. Therefore, nodes with similar positions (e.g., in the same community) are highly believed to have similar RW statistics.
201
+
202
+ ## – R –
203
+
204
+ ### ramage2009rwt
205
+
206
+ ["Random walks for text semantic similarity"](https://dl.acm.org/doi/10.5555/1708124.1708131)
207
+ **Daniel Ramage**, **Anna Rafferty**, **Christopher Manning**
208
+ _ACL-IJCNLP_ (2009-09-07)
209
+ > Our algorithm aggregates local relatedness information via a random walk over a graph constructed from an underlying lexical resource. The stationary distribution of the graph walk forms a “semantic signature” that can be compared to another such distribution to get a relatedness score for texts.
210
+
211
+ ## – W –
212
+
213
+ ### warmerdam2023pydata
214
+
215
+ ["Natural Intelligence is All You Need™"](https://youtu.be/C9p7suS-NGk?si=7Ohq3BV654ia2Im4)
216
+ **Vincent Warmerdam**
217
+ *PyData Amsterdam* (2023-09-15)
218
+ > In this talk I will try to show you what might happen if you allow yourself the creative freedom to rethink and reinvent common practices once in a while. As it turns out, in order to do that, natural intelligence is all you need. And we may start needing a lot of it in the near future.
219
+
220
+ ### wen2023mindmap
221
+
222
+ ["MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large Language Models"](https://arxiv.org/abs/2308.09729)
223
+ **Yilin Wen**, **Zifeng Wang**, **Jimeng Sun**
224
+ _arXiv_ (2023-08-17)
225
+ > We build a prompting pipeline that endows LLMs with the capability of comprehending KG inputs and inferring with a combined implicit knowledge and the retrieved external knowledge.
226
+
227
+ ### wolf2020transformers
228
+
229
+ ["Transformers: State-of-the-Art Natural Language Processing"](https://doi.org/10.18653/v1/2020.emnlp-demos.6)
230
+ **Thomas Wolf**, **Lysandre Debut**, **Victor Sanh**, **Julien Chaumond**, **Clement Delangue**, **Anthony Moi**, **Pierric Cistac**, **Tim Rault**, **Remi Louf**, **Morgan Funtowicz**, **Joe Davison**, **Sam Shleifer**, **Patrick von Platen**, **Clara Ma**, **Yacine Jernite**, **Julien Plu**, **Canwen Xu**, **Teven Le Scao**, **Sylvain Gugger**, **Mariama Drame**, **Quentin Lhoest**, **Alexander Rush**
231
+ *EMNLP* (2020-11-16)
232
+ > The library consists of carefully engineered state-of-the art Transformer architectures under a unified API. Backing this library is a curated collection of pretrained models made by and available for the community.
docs/build.md ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Build Instructions
2
+
3
+ <img src="../assets/nouns/api.png" alt="API by Adnen Kadri from the Noun Project" />
4
+
5
+ !!! note
6
+ In most cases you won't need to build this package locally.
7
+
8
+ Unless you're doing development work on the **textgraphs** library itself,
9
+ simply install based on the instructions in
10
+ ["Getting Started"](https://derwen.ai/docs/txg/start/).
11
+
12
+
13
+ ## Setup
14
+
15
+ To set up the build environment locally:
16
+ ```
17
+ python3 -m venv venv
18
+ source venv/bin/activate
19
+ python3 -m pip install -U pip wheel setuptools
20
+
21
+ python3 -m pip install -e .
22
+ python3 -m pip install -r requirements-dev.txt
23
+ ```
24
+
25
+ We use *pre-commit hooks* based on [`pre-commit`](https://pre-commit.com/).
26
+ To configure these hooks locally:
27
+ ```
28
+ pre-commit install --hook-type pre-commit
29
+ ```
30
+
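+ To run all of the configured hooks manually across the repo, e.g.,
+ before pushing changes:
+ ```
+ pre-commit run --all-files
+ ```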
31
+
32
+ ## Test Coverage
33
+
34
+ This project uses
35
+ [`pytest`](https://docs.pytest.org/)
36
+ for *unit test* coverage.
37
+ Source for unit tests is in the
38
+ [`tests`](https://github.com/DerwenAI/textgraphs/tree/main/tests)
39
+ subdirectory.
40
+
41
+ To run the unit tests:
42
+ ```
43
+ python3 -m pytest
44
+ ```
45
+
46
+ Note that these tests run as part of the CI workflow
47
+ whenever code is updated on the GitHub repo.
48
+
49
+
50
+ ## Online Documentation
51
+
52
+ To generate documentation pages, you will also need to download
53
+ [`ChromeDriver`](https://googlechromelabs.github.io/chrome-for-testing/)
54
+ for your version of the `Chrome` browser, saved as `chromedriver` in
55
+ this directory.
56
+
57
+ Source for the documentation is in the
58
+ [`docs`](https://github.com/DerwenAI/textgraphs/tree/main/docs)
59
+ subdirectory.
60
+
61
+ To build the documentation:
62
+ ```
63
+ ./bin/nb_md.sh
64
+ ./pkg_doc.py docs/ref.md
65
+ mkdocs build
66
+ ```
67
+
68
+ Then run `./bin/preview.py` and load <http://127.0.0.1:8000/docs/>
69
+ in your browser to preview the generated microsite locally.
70
+
71
+ To package the generated microsite for deployment on a
72
+ web server:
73
+ ```
74
+ tar cvzf txg.tgz site/
75
+ ```
76
+
77
+
78
+ ## Remote Repo Updates
79
+
80
+ To update source code repo on GitHub:
81
+
82
+ ```
83
+ git remote set-url origin https://github.com/DerwenAI/textgraphs.git
84
+ git push
85
+ ```
86
+
87
+ Create new releases on GitHub, then run `git pull` locally prior to
88
+ updating Hugging Face or making a new package release.
89
+
90
+ To update source code repo+demo on Hugging Face:
91
+
92
+ ```
93
+ git remote set-url origin https://huggingface.co/spaces/DerwenAI/textgraphs
94
+ git push
95
+ ```
96
+
97
+
98
+ ## Package Release
99
+
100
+ To update the [release on PyPI](https://pypi.org/project/textgraphs/):
101
+ ```
102
+ ./bin/push_pypi.sh
103
+ ```
104
+
105
+
106
+ ## Packaging
107
+
108
+ Both the spaCy and PyPI teams induce packaging errors downstream, since they
109
+ hold "opinionated" views which conflict with each other and also
110
+ don't quite follow the [Python packaging standards](https://peps.python.org/pep-0621/).
111
+
112
+ Moreover, the various dependencies here use a wide range of approaches
113
+ for model downloads: quite appropriately, the spaCy team does not want
114
+ to package their language models on PyPI.
115
+ However, they don't use more contemporary means of model download,
116
+ such as HF transformers, either -- and that triggers logging problems.
117
+ Overall, logging approaches used by the dependencies here for errors/warnings
118
+ are mostly ad-hoc.
119
+
120
+ These three issues (packaging, model downloads, logging) pose a small nightmare
121
+ for managing Python library packaging downstream.
122
+ To that point, this project implements several workarounds so that
123
+ applications can install from PyPI.
124
+
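+ As one illustration of the kind of workaround involved -- a minimal
+ sketch, not this library's actual internals -- a downstream application
+ might quiet the noisier dependencies like this:
+ ```
+ import logging
+
+ import transformers
+
+ # suppress the verbose warnings which HF transformers emits
+ # while downloading models
+ transformers.logging.set_verbosity_error()
+
+ # spaCy logs through the standard library logger named "spacy"
+ logging.getLogger("spacy").setLevel(logging.ERROR)
+ ```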
125
+ Meanwhile keep watch on developments of the following dependencies,
126
+ if they introduce breaking changes or move toward more standard
127
+ packaging practices:
128
+
129
+ * `spaCy` -- model downloads, logging
130
+ * `OpenNRE` -- PyPI packaging, logging
131
+ * HF `transformers` and `tokenizers` -- logging
132
+ * WikiMedia APIs -- SSL certificate expiry
docs/conclude.md ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Conclusions
2
+
3
+ **DRAFT** (WIP)
4
+
5
+ The `TextGraphs` library is a highly configurable, extensible open source Python package for integrating and evaluating several LLM components. It has been built with attention to concurrency and parallelism, for high-performance computing on distributed systems.
6
+
7
+ TODO:
8
+
9
+ - leverage co-reference
10
+ - leverage closure constrained by domain/range
11
+ - general => specific, uncertain => confident
12
+
13
+ The state of _relation extraction_ is arguably immature.
14
+ While the papers in this area compare against benchmarks, their training datasets have mostly been built from Wikidata sources, and the inferred relations result in _labels_, not IRIs.
15
+ This precludes downstream use of the inferred relations for semantic inference.
16
+ Ultimately, how can better training data be developed -- e.g., for relation extraction -- to improve large models used in constructing/augmenting knowledge graphs?
17
+
18
+ ## Questions for Follow-Up Research
19
+
20
+ Many existing projects produce results which are **descriptive, but not computable**.
21
+ However, given recent innovations, such as _DPO_, there appear to be many opportunities for reworking the training datasets used in
22
+ NRE and RE models, following the pattern of `Notus`.
23
+
24
+ **R1**: we have demonstrated how to leverage LLM components while emphasizing HITL (domain experts) and quality of results
25
+
26
+
27
+ **R2**: we have suggested areas where investments in data quality
28
+ may provide substantial gains
29
+
30
+ One key take-away from this project is that the model deployments are relatively haphazard across a wide spectrum of performance: some of the open source dependencies use efficient frameworks such as Hugging Face `transformers` to load models, while others use ad-hoc approaches which are much less performant.
31
+
32
+ Granted, use of LLMs and other deep learning models is expected to increase computational requirements substantially.
33
+ Given the integration of APIs, the compute, memory, and network requirements for running the `TextGraphs` library in production can be quite large.
34
+ Software engineering optimizations can reduce these requirements substantially through use of hardware acceleration, localized services, proxy/caching, and concurrency.
35
+
36
+ However, a more effective approach would be to make investments in data quality (training datasets, benchmarks, evals, etc.) for gains within the core technologies used here: NER, RE, etc.
37
+ Data-first iterations on the model dependencies can alleviate much of this problem.
38
+
39
+
40
+ **R3**: we have proposed a rubric for evaluating/rating ML open source
41
+ w.r.t. production use cases
42
+
43
+ This project integrates available open source projects across a wide range of NLP topics.
44
+ Perspectives were gained from evaluating many open source LLM projects related to NLP components, and the state of readiness for their use in production libraries overall.
45
+
46
+ Note that reproducibility rates are abysmally low for open source which accompanies machine learning research papers.
47
+ Few projects install correctly, and fewer still run without exceptions.
48
+ Even the better available OSS projects for a given research topic (e.g., _graph embeddings_, _relation extraction_) tend not to have been maintained for years. Of the projects which run, few reproduce their published results, and most are oriented toward command-line (CLI) use to prove specific benchmark claims.
49
+ These tend to be difficult to rework into production-quality libraries, due to concerns about performance, security, licensing, etc.
50
+
51
+ As an outcome of this inquiry, this project presents a rubric for evaluating research papers and their associated code, based on reproducibility and eventual usefulness in software implementations.
52
+
53
+ The views expressed are those of the authors and do not reflect the official policy or position of the funding organizations.
docs/details.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ This project implements an LLM-augmented `textgraph` algorithm for
2
+ constructing a _lemma graph_ from raw, unstructured text sources.
3
+
4
+ The `TextGraphs` library is based on work developed by
5
+ [Derwen](https://derwen.ai/graph)
6
+ in 2023 Q2 for customer apps and used in our `Cysoni`
7
+ product.
8
+
9
+ This library integrates code from:
10
+
11
+ * [`SpanMarker`](https://github.com/tomaarsen/SpanMarkerNER/)
12
+ * [`spaCy-DBpedia-Spotlight`](https://github.com/MartinoMensio/spacy-dbpedia-spotlight)
13
+ * [`REBEL`](https://github.com/Babelscape/rebel)
14
+ * [`OpenNRE`](https://github.com/thunlp/OpenNRE/)
15
+ * [`qwikidata`](https://github.com/kensho-technologies/qwikidata)
16
+ * [`pulp`](https://github.com/coin-or/pulp)
17
+ * [`spaCy`](https://spacy.io/)
18
+ * [`HF transformers`](https://huggingface.co/docs/transformers/index)
19
+ * [`PyTextRank`](https://github.com/DerwenAI/pytextrank/)
20
+
21
+
22
+ For more background about early efforts which led to this line of inquiry, see the recent talks:
23
+
24
+ * ["Language, Graphs, and AI in Industry"](https://derwen.ai/s/mqqm)
25
+ **Paco Nathan**, K1st World (2023-10-11) ([video](https://derwen.ai/s/4h2kswhrm3gc))
26
+ * ["Language Tools for Creators"](https://derwen.ai/s/rhvg)
27
+ **Paco Nathan**, FOSSY (2023-07-13)
28
+
29
+
30
+ The `TextGraphs` library shows integrations of several of these kinds
31
+ of components, complemented with use of graph queries, graph algorithms,
32
+ and other related tooling.
33
+ Admittedly, the results present a "hybrid" approach:
34
+ it's not purely "generative" -- whatever that might mean.
35
+
36
+ A core principle here is to provide results from the natural language
37
+ workflows which may be used for expert feedback.
38
+ In other words, how can we support means for leveraging
39
+ _human-in-the-loop_ (HITL) processes?
40
+
41
+ Another principle has been to create a Python library built to produce
42
+ configurable, extensible pipelines.
43
+ Care has been given to writing code that can be run concurrently
44
+ (e.g., leveraging `asyncio`), using dependencies which have
45
+ business-friendly licenses, and paying attention to security concerns.
46
+
47
+ The library provides three main affordances for AI applications:
48
+
49
+ 1. With the default settings, one can use `TextGraphs` to extract ranked key phrases from raw text -- even without using any of the additional deep learning models (see the sketch after this list).
50
+
51
+ 2. Going a few further steps, one can generate an RDF or LPG graph from raw texts, and make use of _entity linking_, _relation extraction_, and other techniques to ground the natural language parsing by leveraging some knowledge graph which represents a particular domain. Default examples use WikiMedia graphs: DBPedia, Wikidata, etc.
52
+
53
+ 3. A third set of goals for `TextGraphs` is to provide a "playground" or "gym" for evaluating _graph levels of detail_, i.e., abstraction layers for knowledge graphs, and to explore some of the emerging work to produce _foundation models_ for knowledge graphs through topological transforms.
54
+
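+ For example, the first affordance takes only a few lines of code.
+ Here is a minimal sketch, assembled from the calls shown in the demo
+ notebook -- assuming the `TextGraphs` constructor supplies a default
+ pipeline factory when none is given:
+
+ ```python
+ import textgraphs
+
+ # assumption: default construction configures the basic spaCy pipeline
+ tg = textgraphs.TextGraphs()
+
+ pipe = tg.create_pipeline(
+     "Werner Herzog is a remarkable filmmaker and an intellectual.",
+ )
+
+ # collect graph elements from the parse, build a lemma graph,
+ # then rank the extracted phrases
+ tg.collect_graph_elements(pipe)
+ tg.construct_lemma_graph()
+ tg.calc_phrase_ranks(pr_alpha = textgraphs.PAGERANK_ALPHA)
+
+ print(tg.get_phrases_as_df())
+ ```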
55
+ Regarding the third point, consider how language parsing produces
56
+ graphs by definition, although NLP results tend to be quite _noisy_.
57
+ The annotations inferred by NLP pipelines often get thrown out.
58
+ This seemed like a good opportunity to generate sample data for
59
+ "condensing" graphs into more abstracted representations.
60
+ In other words, patterns within the relatively noisy parse results
61
+ can be condensed into relatively refined knowledge graph elements.
62
+
63
+ Note that while the `spaCy` library for NLP plays a central role, the
64
+ `TextGraphs` library is not intended to become a `spaCy` pipeline.
docs/ex0_0.md ADDED
@@ -0,0 +1,689 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ !!! note
4
+ To run this notebook in JupyterLab, load [`examples/ex0_0.ipynb`](https://github.com/DerwenAI/textgraphs/blob/main/examples/ex0_0.ipynb)
5
+
6
+
7
+
8
+ # demo: TextGraphs + LLMs to construct a 'lemma graph'
9
+
10
+ The _TextGraphs_ library is intended for iterating through a sequence of paragraphs.
11
+
12
+ ## environment
13
+
14
+
15
+ ```python
16
+ from IPython.display import display, HTML, Image, SVG
17
+ import pathlib
18
+ import typing
19
+
20
+ from icecream import ic
21
+ from pyinstrument import Profiler
22
+ import matplotlib.pyplot as plt
23
+ import pandas as pd
24
+ import pyvis
25
+ import spacy
26
+
27
+ import textgraphs
28
+ ```
29
+
30
+
31
+ ```python
32
+ %load_ext watermark
33
+ ```
34
+
35
+
36
+ ```python
37
+ %watermark
38
+ ```
39
+
40
+ Last updated: 2024-01-16T17:41:51.229985-08:00
41
+
42
+ Python implementation: CPython
43
+ Python version : 3.10.11
44
+ IPython version : 8.20.0
45
+
46
+ Compiler : Clang 13.0.0 (clang-1300.0.29.30)
47
+ OS : Darwin
48
+ Release : 21.6.0
49
+ Machine : x86_64
50
+ Processor : i386
51
+ CPU cores : 8
52
+ Architecture: 64bit
53
+
54
+
55
+
56
+
57
+ ```python
58
+ %watermark --iversions
59
+ ```
60
+
61
+ sys : 3.10.11 (v3.10.11:7d4cc5aa85, Apr 4 2023, 19:05:19) [Clang 13.0.0 (clang-1300.0.29.30)]
62
+ spacy : 3.7.2
63
+ pandas : 2.1.4
64
+ matplotlib: 3.8.2
65
+ textgraphs: 0.5.0
66
+ pyvis : 0.3.2
67
+
68
+
69
+
70
+ ## parse a document
71
+
72
+ provide the source text
73
+
74
+
75
+ ```python
76
+ SRC_TEXT: str = """
77
+ Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
78
+ After the war, Werner fled to America to become famous.
79
+ """
80
+ ```
81
+
82
+ set up the statistical stack profiling
83
+
84
+
85
+ ```python
86
+ profiler: Profiler = Profiler()
87
+ profiler.start()
88
+ ```
89
+
90
+ set up the `TextGraphs` pipeline
91
+
92
+
93
+ ```python
94
+ tg: textgraphs.TextGraphs = textgraphs.TextGraphs(
95
+ factory = textgraphs.PipelineFactory(
96
+ spacy_model = textgraphs.SPACY_MODEL,
97
+ ner = None,
98
+ kg = textgraphs.KGWikiMedia(
99
+ spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API,
100
+ dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API,
101
+ dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API,
102
+ wikidata_api = textgraphs.WIKIDATA_API,
103
+ min_alias = textgraphs.DBPEDIA_MIN_ALIAS,
104
+ min_similarity = textgraphs.DBPEDIA_MIN_SIM,
105
+ ),
106
+ infer_rels = [
107
+ textgraphs.InferRel_OpenNRE(
108
+ model = textgraphs.OPENNRE_MODEL,
109
+ max_skip = textgraphs.MAX_SKIP,
110
+ min_prob = textgraphs.OPENNRE_MIN_PROB,
111
+ ),
112
+ textgraphs.InferRel_Rebel(
113
+ lang = "en_XX",
114
+ mrebel_model = textgraphs.MREBEL_MODEL,
115
+ ),
116
+ ],
117
+ ),
118
+ )
119
+
120
+ pipe: textgraphs.Pipeline = tg.create_pipeline(
121
+ SRC_TEXT.strip(),
122
+ )
123
+ ```
124
+
125
+ ## visualize the parse results
126
+
127
+
128
+ ```python
129
+ spacy.displacy.render(
130
+ pipe.ner_doc,
131
+ style = "ent",
132
+ jupyter = True,
133
+ )
134
+ ```
135
+
136
+
137
+ <span class="tex2jax_ignore"><div class="entities" style="line-height: 2.5; direction: ltr">
138
+ <mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
139
+ Werner Herzog
140
+ <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
141
+ </mark>
142
+ is a remarkable filmmaker and an intellectual originally from
143
+ <mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
144
+ Germany
145
+ <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">GPE</span>
146
+ </mark>
147
+ , the son of
148
+ <mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
149
+ Dietrich Herzog
150
+ <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
151
+ </mark>
152
+ .<br>After the war,
153
+ <mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
154
+ Werner
155
+ <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
156
+ </mark>
157
+ fled to
158
+ <mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
159
+ America
160
+ <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">GPE</span>
161
+ </mark>
162
+ to become famous.</div></span>
163
+
164
+
165
+
166
+ ```python
167
+ parse_svg: str = spacy.displacy.render(
168
+ pipe.ner_doc,
169
+ style = "dep",
170
+ jupyter = False,
171
+ )
172
+
173
+ display(SVG(parse_svg))
174
+ ```
175
+
176
+
177
+
178
+ ![svg](ex0_0_files/ex0_0_17_0.svg)
179
+
180
+
181
+
182
+ ## collect graph elements from the parse
183
+
184
+
185
+ ```python
186
+ tg.collect_graph_elements(
187
+ pipe,
188
+ debug = False,
189
+ )
190
+ ```
191
+
192
+
193
+ ```python
194
+ ic(len(tg.nodes.values()));
195
+ ic(len(tg.edges.values()));
196
+ ```
197
+
198
+ ic| len(tg.nodes.values()): 36
199
+ ic| len(tg.edges.values()): 42
200
+
201
+
202
+ ## perform entity linking
203
+
204
+
205
+ ```python
206
+ tg.perform_entity_linking(
207
+ pipe,
208
+ debug = False,
209
+ )
210
+ ```
211
+
212
+ ## infer relations
213
+
214
+
215
+ ```python
216
+ inferred_edges: list = await tg.infer_relations_async(
217
+ pipe,
218
+ debug = False,
219
+ )
220
+
221
+ inferred_edges
222
+ ```
223
+
224
+
225
+
226
+
227
+ [Edge(src_node=0, dst_node=10, kind=<RelEnum.INF: 2>, rel='https://schema.org/nationality', prob=1.0, count=1),
228
+ Edge(src_node=15, dst_node=0, kind=<RelEnum.INF: 2>, rel='https://schema.org/children', prob=1.0, count=1),
229
+ Edge(src_node=27, dst_node=22, kind=<RelEnum.INF: 2>, rel='https://schema.org/event', prob=1.0, count=1)]
230
+
231
+
232
+
233
+ ## construct a lemma graph
234
+
235
+
236
+ ```python
237
+ tg.construct_lemma_graph(
238
+ debug = False,
239
+ )
240
+ ```
241
+
242
+ ## extract ranked entities
243
+
244
+
245
+ ```python
246
+ tg.calc_phrase_ranks(
247
+ pr_alpha = textgraphs.PAGERANK_ALPHA,
248
+ debug = False,
249
+ )
250
+ ```
251
+
252
+ show the resulting entities extracted from the document
253
+
254
+
255
+ ```python
256
+ df: pd.DataFrame = tg.get_phrases_as_df()
257
+ df
258
+ ```
259
+
260
+
261
+
262
+
263
+ <div>
264
+ <style scoped>
265
+ .dataframe tbody tr th:only-of-type {
266
+ vertical-align: middle;
267
+ }
268
+
269
+ .dataframe tbody tr th {
270
+ vertical-align: top;
271
+ }
272
+
273
+ .dataframe thead th {
274
+ text-align: right;
275
+ }
276
+ </style>
277
+ <table border="1" class="dataframe">
278
+ <thead>
279
+ <tr style="text-align: right;">
280
+ <th></th>
281
+ <th>node_id</th>
282
+ <th>text</th>
283
+ <th>pos</th>
284
+ <th>label</th>
285
+ <th>count</th>
286
+ <th>weight</th>
287
+ </tr>
288
+ </thead>
289
+ <tbody>
290
+ <tr>
291
+ <th>0</th>
292
+ <td>0</td>
293
+ <td>Werner Herzog</td>
294
+ <td>PROPN</td>
295
+ <td>dbr:Werner_Herzog</td>
296
+ <td>1</td>
297
+ <td>0.080547</td>
298
+ </tr>
299
+ <tr>
300
+ <th>1</th>
301
+ <td>10</td>
302
+ <td>Germany</td>
303
+ <td>PROPN</td>
304
+ <td>dbr:Germany</td>
305
+ <td>1</td>
306
+ <td>0.080437</td>
307
+ </tr>
308
+ <tr>
309
+ <th>2</th>
310
+ <td>15</td>
311
+ <td>Dietrich Herzog</td>
312
+ <td>PROPN</td>
313
+ <td>dbo:Person</td>
314
+ <td>1</td>
315
+ <td>0.079048</td>
316
+ </tr>
317
+ <tr>
318
+ <th>3</th>
319
+ <td>27</td>
320
+ <td>America</td>
321
+ <td>PROPN</td>
322
+ <td>dbr:United_States</td>
323
+ <td>1</td>
324
+ <td>0.079048</td>
325
+ </tr>
326
+ <tr>
327
+ <th>4</th>
328
+ <td>24</td>
329
+ <td>Werner</td>
330
+ <td>PROPN</td>
331
+ <td>dbo:Person</td>
332
+ <td>1</td>
333
+ <td>0.077633</td>
334
+ </tr>
335
+ <tr>
336
+ <th>5</th>
337
+ <td>4</td>
338
+ <td>filmmaker</td>
339
+ <td>NOUN</td>
340
+ <td>owl:Thing</td>
341
+ <td>1</td>
342
+ <td>0.076309</td>
343
+ </tr>
344
+ <tr>
345
+ <th>6</th>
346
+ <td>22</td>
347
+ <td>war</td>
348
+ <td>NOUN</td>
349
+ <td>owl:Thing</td>
350
+ <td>1</td>
351
+ <td>0.076309</td>
352
+ </tr>
353
+ <tr>
354
+ <th>7</th>
355
+ <td>32</td>
356
+ <td>a remarkable filmmaker</td>
357
+ <td>noun_chunk</td>
358
+ <td>None</td>
359
+ <td>1</td>
360
+ <td>0.076077</td>
361
+ </tr>
362
+ <tr>
363
+ <th>8</th>
364
+ <td>7</td>
365
+ <td>intellectual</td>
366
+ <td>NOUN</td>
367
+ <td>owl:Thing</td>
368
+ <td>1</td>
369
+ <td>0.074725</td>
370
+ </tr>
371
+ <tr>
372
+ <th>9</th>
373
+ <td>13</td>
374
+ <td>son</td>
375
+ <td>NOUN</td>
376
+ <td>owl:Thing</td>
377
+ <td>1</td>
378
+ <td>0.074725</td>
379
+ </tr>
380
+ <tr>
381
+ <th>10</th>
382
+ <td>33</td>
383
+ <td>an intellectual</td>
384
+ <td>noun_chunk</td>
385
+ <td>None</td>
386
+ <td>1</td>
387
+ <td>0.074606</td>
388
+ </tr>
389
+ <tr>
390
+ <th>11</th>
391
+ <td>34</td>
392
+ <td>the son</td>
393
+ <td>noun_chunk</td>
394
+ <td>None</td>
395
+ <td>1</td>
396
+ <td>0.074606</td>
397
+ </tr>
398
+ <tr>
399
+ <th>12</th>
400
+ <td>35</td>
401
+ <td>the war</td>
402
+ <td>noun_chunk</td>
403
+ <td>None</td>
404
+ <td>1</td>
405
+ <td>0.074606</td>
406
+ </tr>
407
+ </tbody>
408
+ </table>
409
+ </div>
410
+
411
+
412
+
413
+ ## visualize the lemma graph
414
+
415
+
416
+ ```python
417
+ render: textgraphs.RenderPyVis = tg.create_render()
418
+
419
+ pv_graph: pyvis.network.Network = render.render_lemma_graph(
420
+ debug = False,
421
+ )
422
+ ```
423
+
424
+ initialize the layout parameters
425
+
426
+
427
+ ```python
428
+ pv_graph.force_atlas_2based(
429
+ gravity = -38,
430
+ central_gravity = 0.01,
431
+ spring_length = 231,
432
+ spring_strength = 0.7,
433
+ damping = 0.8,
434
+ overlap = 0,
435
+ )
436
+
437
+ pv_graph.show_buttons(filter_ = [ "physics" ])
438
+ pv_graph.toggle_physics(True)
439
+ ```
440
+
441
+
442
+ ```python
443
+ pv_graph.prep_notebook()
444
+ pv_graph.show("tmp.fig01.html")
445
+ ```
446
+
447
+ tmp.fig01.html
448
+
449
+
450
+
451
+
452
+
453
+
454
+ ![png](ex0_0_files/tmp.fig01.png)
455
+
456
+
457
+
458
+
459
+ ## generate a word cloud
460
+
461
+
462
+ ```python
463
+ wordcloud = render.generate_wordcloud()
464
+ display(wordcloud.to_image())
465
+ ```
466
+
467
+
468
+
469
+ ![png](ex0_0_files/ex0_0_37_0.png)
470
+
471
+
472
+
473
+ ## cluster communities in the lemma graph
474
+
475
+ In the tutorial
476
+ <a href="https://towardsdatascience.com/how-to-convert-any-text-into-a-graph-of-concepts-110844f22a1a" target="_blank">"How to Convert Any Text Into a Graph of Concepts"</a>,
477
+ Rahul Nayak uses the
478
+ <a href="https://en.wikipedia.org/wiki/Girvan%E2%80%93Newman_algorithm"><em>girvan-newman</em></a>
479
+ algorithm to split the graph into communities, then clusters on those communities.
480
+ His approach works well for unsupervised clustering of key phrases which have been extracted from many documents.
481
+ Note that, in contrast with the approach here, Nayak was working with entities extracted from "chunks" of text, not with a text graph.
482
+
483
+
484
+ ```python
485
+ render.draw_communities();
486
+ ```
487
+
488
+
489
+
490
+ ![png](ex0_0_files/ex0_0_40_0.png)
491
+
492
+
493
+
494
+ ## graph of relations transform
495
+
496
+ Show a transformed graph, based on _graph of relations_ (see: `lee2023ingram`)
497
+
498
+
499
+ ```python
500
+ graph: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(
501
+ tg
502
+ )
503
+
504
+ graph.seeds()
505
+ graph.construct_gor()
506
+ ```
507
+
508
+
509
+ ```python
510
+ scores: typing.Dict[ tuple, float ] = graph.get_affinity_scores()
511
+ pv_graph: pyvis.network.Network = graph.render_gor_pyvis(scores)
512
+
513
+ pv_graph.force_atlas_2based(
514
+ gravity = -38,
515
+ central_gravity = 0.01,
516
+ spring_length = 231,
517
+ spring_strength = 0.7,
518
+ damping = 0.8,
519
+ overlap = 0,
520
+ )
521
+
522
+ pv_graph.show_buttons(filter_ = [ "physics" ])
523
+ pv_graph.toggle_physics(True)
524
+
525
+ pv_graph.prep_notebook()
526
+ pv_graph.show("tmp.fig02.html")
527
+ ```
528
+
529
+ tmp.fig02.html
530
+
531
+
532
+
533
+
534
+
535
+
536
+ ![png](ex0_0_files/tmp.fig02.png)
537
+
538
+
539
+
540
+
541
+ *What does this transform provide?*
542
+
543
+ By using a _graph of relations_ dual representation of our graph data, first and foremost we obtain a more compact representation of the relations in the graph, and a means of making inferences (e.g., _link prediction_) where there is substantially more invariance in the training data.
544
+
545
+ Also recognize that for a parse graph of a paragraph in the English language, the most interesting nodes will probably be either subjects (`nsubj`) or prepositional objects (`pobj`). Here the _graph of relations_ illustrates how the important details from _entity linking_ tend to cluster near either `nsubj` or `pobj` entities, connected through punctuation. This is not as readily observed in the earlier visualization of the _lemma graph_.
546
+
547
+ ## extract as RDF triples
548
+
549
+ Extract the nodes and edges which have IRIs, to create an "abstraction layer" as a semantic graph at a higher level of detail above the _lemma graph_:
550
+
551
+
552
+ ```python
553
+ triples: str = tg.export_rdf()
554
+ print(triples)
555
+ ```
556
+
557
+ @base <https://github.com/DerwenAI/textgraphs/ns/> .
558
+ @prefix dbo: <http://dbpedia.org/ontology/> .
559
+ @prefix dbr: <http://dbpedia.org/resource/> .
560
+ @prefix schema: <https://schema.org/> .
561
+ @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
562
+ @prefix wd_ent: <http://www.wikidata.org/entity/> .
563
+
564
+ dbr:Germany skos:definition "Germany (German: Deutschland, German pronunciation: [ˈdɔʏtʃlant]), constitutionally the Federal"@en ;
565
+ skos:prefLabel "Germany"@en .
566
+
567
+ dbr:United_States skos:definition "The United States of America (USA), commonly known as the United States (U.S. or US) or America"@en ;
568
+ skos:prefLabel "United States"@en .
569
+
570
+ dbr:Werner_Herzog skos:definition "Werner Herzog (German: [ˈvɛɐ̯nɐ ˈhɛɐ̯tsoːk]; born 5 September 1942) is a German film director"@en ;
571
+ skos:prefLabel "Werner Herzog"@en .
572
+
573
+ wd_ent:Q183 skos:definition "country in Central Europe"@en ;
574
+ skos:prefLabel "Germany"@en .
575
+
576
+ wd_ent:Q44131 skos:definition "German film director, producer, screenwriter, actor and opera director"@en ;
577
+ skos:prefLabel "Werner Herzog"@en .
578
+
579
+ <entity/america_PROPN> a dbo:Country ;
580
+ skos:prefLabel "America"@en ;
581
+ schema:event <entity/war_NOUN> .
582
+
583
+ <entity/dietrich_PROPN_herzog_PROPN> a dbo:Person ;
584
+ skos:prefLabel "Dietrich Herzog"@en ;
585
+ schema:children <entity/werner_PROPN_herzog_PROPN> .
586
+
587
+ <entity/filmmaker_NOUN> skos:prefLabel "filmmaker"@en .
588
+
589
+ <entity/intellectual_NOUN> skos:prefLabel "intellectual"@en .
590
+
591
+ <entity/son_NOUN> skos:prefLabel "son"@en .
592
+
593
+ <entity/werner_PROPN> a dbo:Person ;
594
+ skos:prefLabel "Werner"@en .
595
+
596
+ <entity/germany_PROPN> a dbo:Country ;
597
+ skos:prefLabel "Germany"@en .
598
+
599
+ <entity/war_NOUN> skos:prefLabel "war"@en .
600
+
601
+ <entity/werner_PROPN_herzog_PROPN> a dbo:Person ;
602
+ skos:prefLabel "Werner Herzog"@en ;
603
+ schema:nationality <entity/germany_PROPN> .
604
+
605
+ dbo:Country skos:definition "Countries, cities, states"@en ;
606
+ skos:prefLabel "country"@en .
607
+
608
+ dbo:Person skos:definition "People, including fictional"@en ;
609
+ skos:prefLabel "person"@en .
610
+
611
+
612
+
613
+
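+ Since this export is standard Turtle, it can be loaded into an RDF
+ library for downstream queries. For example, here is a minimal sketch
+ using `rdflib` -- an assumption on our part, not a dependency shown in
+ this demo:
+
+ ```python
+ import rdflib
+
+ g = rdflib.Graph()
+ g.parse(data = triples, format = "turtle")
+
+ # for example, list the subjects typed as dbo:Person
+ dbo_person = rdflib.URIRef("http://dbpedia.org/ontology/Person")
+
+ for subj in g.subjects(predicate = rdflib.RDF.type, object = dbo_person):
+     print(subj)
+ ```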
614
+ ## statistical stack profile instrumentation
615
+
616
+
617
+ ```python
618
+ profiler.stop()
619
+ ```
620
+
621
+
622
+
623
+
624
+ <pyinstrument.session.Session at 0x141446080>
625
+
626
+
627
+
628
+
629
+ ```python
630
+ profiler.print()
631
+ ```
632
+
633
+
634
+ _ ._ __/__ _ _ _ _ _/_ Recorded: 17:41:51 Samples: 11163
635
+ /_//_/// /_\ / //_// / //_'/ // Duration: 57.137 CPU time: 72.235
636
+ / _/ v4.6.1
637
+
638
+ Program: /Users/paco/src/textgraphs/venv/lib/python3.10/site-packages/ipykernel_launcher.py -f /Users/paco/Library/Jupyter/runtime/kernel-8ffadb7d-3b45-4e0e-a94f-f098e5ad9fbe.json
639
+
640
+ 57.136 _UnixSelectorEventLoop._run_once asyncio/base_events.py:1832
641
+ └─ 57.135 Handle._run asyncio/events.py:78
642
+ [12 frames hidden] asyncio, ipykernel, IPython
643
+ 41.912 ZMQInteractiveShell.run_ast_nodes IPython/core/interactiveshell.py:3394
644
+ ├─ 20.701 <module> ../ipykernel_5151/1245857438.py:1
645
+ │ └─ 20.701 TextGraphs.perform_entity_linking textgraphs/doc.py:534
646
+ │ └─ 20.701 KGWikiMedia.perform_entity_linking textgraphs/kg.py:306
647
+ │ ├─ 10.790 KGWikiMedia._link_kg_search_entities textgraphs/kg.py:932
648
+ │ │ └─ 10.787 KGWikiMedia.dbpedia_search_entity textgraphs/kg.py:641
649
+ │ │ └─ 10.711 get requests/api.py:62
650
+ │ │ [37 frames hidden] requests, urllib3, http, socket, ssl,...
651
+ │ ├─ 9.143 KGWikiMedia._link_spotlight_entities textgraphs/kg.py:851
652
+ │ │ └─ 9.140 KGWikiMedia.dbpedia_search_entity textgraphs/kg.py:641
653
+ │ │ └─ 9.095 get requests/api.py:62
654
+ │ │ [37 frames hidden] requests, urllib3, http, socket, ssl,...
655
+ │ └─ 0.768 KGWikiMedia._secondary_entity_linking textgraphs/kg.py:1060
656
+ │ └─ 0.768 KGWikiMedia.wikidata_search textgraphs/kg.py:575
657
+ │ └─ 0.765 KGWikiMedia._wikidata_endpoint textgraphs/kg.py:444
658
+ │ └─ 0.765 get requests/api.py:62
659
+ │ [7 frames hidden] requests, urllib3
660
+ └─ 19.514 <module> ../ipykernel_5151/1708547378.py:1
661
+ ├─ 14.502 InferRel_Rebel.__init__ textgraphs/rel.py:121
662
+ │ └─ 14.338 pipeline transformers/pipelines/__init__.py:531
663
+ │ [39 frames hidden] transformers, torch, <built-in>, json
664
+ ├─ 3.437 PipelineFactory.__init__ textgraphs/pipe.py:434
665
+ │ └─ 3.420 load spacy/__init__.py:27
666
+ │ [20 frames hidden] spacy, en_core_web_sm, catalogue, imp...
667
+ ├─ 0.900 InferRel_OpenNRE.__init__ textgraphs/rel.py:33
668
+ │ └─ 0.888 get_model opennre/pretrain.py:126
669
+ └─ 0.672 TextGraphs.create_pipeline textgraphs/doc.py:103
670
+ └─ 0.672 PipelineFactory.create_pipeline textgraphs/pipe.py:508
671
+ └─ 0.672 Pipeline.__init__ textgraphs/pipe.py:216
672
+ └─ 0.672 English.__call__ spacy/language.py:1016
673
+ [11 frames hidden] spacy, spacy_dbpedia_spotlight, reque...
674
+ 14.363 InferRel_Rebel.gen_triples_async textgraphs/pipe.py:188
675
+ ├─ 13.670 InferRel_Rebel.gen_triples textgraphs/rel.py:259
676
+ │ ├─ 12.439 InferRel_Rebel.tokenize_sent textgraphs/rel.py:145
677
+ │ │ └─ 12.436 TranslationPipeline.__call__ transformers/pipelines/text2text_generation.py:341
678
+ │ │ [42 frames hidden] transformers, torch, <built-in>
679
+ │ └─ 1.231 KGWikiMedia.resolve_rel_iri textgraphs/kg.py:370
680
+ │ └─ 0.753 get_entity_dict_from_api qwikidata/linked_data_interface.py:21
681
+ │ [8 frames hidden] qwikidata, requests, urllib3
682
+ └─ 0.693 InferRel_OpenNRE.gen_triples textgraphs/rel.py:58
683
+
684
+
685
+
686
+
687
+ ## outro
688
+
689
+ _\[ more parts are in progress, getting added to this demo \]_
docs/ex0_0_files/ex0_0_17_0.svg ADDED
docs/ex0_0_files/ex0_0_37_0.jpg ADDED
docs/ex0_0_files/ex0_0_37_0.png ADDED
docs/ex0_0_files/ex0_0_39_0.jpg ADDED
docs/ex0_0_files/ex0_0_39_0.png ADDED
docs/ex0_0_files/ex0_0_40_0.png ADDED
docs/ex0_0_files/ex0_0_42_0.png ADDED
docs/ex0_0_files/tmp.fig01.png ADDED
docs/ex0_0_files/tmp.fig02.png ADDED
docs/ex1_0.md ADDED
@@ -0,0 +1,776 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ !!! note
4
+ To run this notebook in JupyterLab, load [`examples/ex1_0.ipynb`](https://github.com/DerwenAI/textgraphs/blob/main/examples/ex1_0.ipynb)
5
+
6
+
7
+
8
+ # reproduce results from the "InGram" paper
9
+
10
+ This is an attempt to reproduce the _graph of relations_ example given in `lee2023ingram`
11
+
12
+ ## environment
13
+
14
+
15
+ ```python
16
+ import os
17
+ import pathlib
18
+ import typing
19
+
20
+ from icecream import ic
21
+ from pyinstrument import Profiler
22
+ import matplotlib.pyplot as plt
23
+ import pandas as pd
24
+ import pyvis
25
+
26
+ import textgraphs
27
+ ```
28
+
29
+
30
+ ```python
31
+ %load_ext watermark
32
+ ```
33
+
34
+
35
+ ```python
36
+ %watermark
37
+ ```
38
+
39
+ Last updated: 2024-01-16T17:35:45.550539-08:00
40
+
41
+ Python implementation: CPython
42
+ Python version : 3.10.11
43
+ IPython version : 8.20.0
44
+
45
+ Compiler : Clang 13.0.0 (clang-1300.0.29.30)
46
+ OS : Darwin
47
+ Release : 21.6.0
48
+ Machine : x86_64
49
+ Processor : i386
50
+ CPU cores : 8
51
+ Architecture: 64bit
52
+
53
+
54
+
55
+
56
+ ```python
57
+ %watermark --iversions
58
+ ```
59
+
60
+ matplotlib: 3.8.2
61
+ pandas : 2.1.4
62
+ pyvis : 0.3.2
63
+ textgraphs: 0.5.0
64
+ sys : 3.10.11 (v3.10.11:7d4cc5aa85, Apr 4 2023, 19:05:19) [Clang 13.0.0 (clang-1300.0.29.30)]
65
+
66
+
67
+
68
+ ## load example graph
69
+
70
+ load from a JSON file which replicates the data for the "Figure 3" example
71
+
72
+
73
+ ```python
74
+ graph: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(
75
+ textgraphs.SimpleGraph()
76
+ )
77
+
78
+ ingram_path: pathlib.Path = pathlib.Path(os.getcwd()) / "ingram.json"
79
+
80
+ graph.load_ingram(
81
+ ingram_path,
82
+ debug = False,
83
+ )
84
+ ```
85
+
86
+ set up the statistical stack profiling
87
+
88
+
89
+ ```python
90
+ profiler: Profiler = Profiler()
91
+ profiler.start()
92
+ ```
93
+
94
+ ## decouple graph edges into "seeds"
95
+
96
+
97
+ ```python
98
+ graph.seeds(
99
+ debug = True,
100
+ )
101
+ ```
102
+
103
+
104
+ --- triples in source graph ---
105
+
106
+
107
+ ic| edge.src_node: 0, rel_id: 1, edge.dst_node: 1
108
+ ic| edge.src_node: 0, rel_id: 0, edge.dst_node: 2
109
+ ic| edge.src_node: 0, rel_id: 0, edge.dst_node: 3
110
+ ic| edge.src_node: 4, rel_id: 2, edge.dst_node: 2
111
+ ic| edge.src_node: 4, rel_id: 2, edge.dst_node: 3
112
+ ic| edge.src_node: 4, rel_id: 1, edge.dst_node: 5
113
+ ic| edge.src_node: 6, rel_id: 1, edge.dst_node: 5
114
+ ic| edge.src_node: 6, rel_id: 2, edge.dst_node: 7
115
+ ic| edge.src_node: 6, rel_id: 4, edge.dst_node: 8
116
+ ic| edge.src_node: 9,
117
+
118
+ Steven_Spielberg Profession Director
119
+ Steven_Spielberg Directed Catch_Me_If_Can
120
+ Steven_Spielberg Directed Saving_Private_Ryan
121
+ Tom_Hanks ActedIn Catch_Me_If_Can
122
+ Tom_Hanks ActedIn Saving_Private_Ryan
123
+ Tom_Hanks Profession Actor
124
+ Mark_Hamil Profession Actor
125
+ Mark_Hamil ActedIn Star_Wars
126
+ Mark_Hamil BornIn California
127
+
128
+
129
+ rel_id: 5, edge.dst_node: 10
130
+ ic| edge.src_node: 9, rel_id: 4, edge.dst_node: 10
131
+ ic| edge.src_node: 9, rel_id: 3, edge.dst_node: 8
132
+ ic| edge.src_node: 11, rel_id: 4, edge.dst_node: 12
133
+ ic| edge.src_node: 11, rel_id: 3, edge.dst_node: 12
134
+ ic| edge.src_node: 11, rel_id: 3, edge.dst_node: 8
135
+
136
+
137
+ Brad_Pitt Nationality USA
138
+ Brad_Pitt BornIn USA
139
+ Brad_Pitt LivedIn California
140
+ Clint_Eastwood BornIn San_Francisco
141
+ Clint_Eastwood LivedIn San_Francisco
142
+ Clint_Eastwood LivedIn California
143
+
144
+
145
+
146
+ ```python
147
+ graph.trace_source_graph()
148
+ ```
149
+
150
+
151
+ --- nodes in source graph ---
152
+ n: 0, Steven_Spielberg
153
+ head: []
154
+ tail: [(0, 'Profession', 1), (0, 'Directed', 2), (0, 'Directed', 3)]
155
+ n: 1, Director
156
+ head: [(0, 'Profession', 1)]
157
+ tail: []
158
+ n: 2, Catch_Me_If_Can
159
+ head: [(0, 'Directed', 2), (4, 'ActedIn', 2)]
160
+ tail: []
161
+ n: 3, Saving_Private_Ryan
162
+ head: [(0, 'Directed', 3), (4, 'ActedIn', 3)]
163
+ tail: []
164
+ n: 4, Tom_Hanks
165
+ head: []
166
+ tail: [(4, 'ActedIn', 2), (4, 'ActedIn', 3), (4, 'Profession', 5)]
167
+ n: 5, Actor
168
+ head: [(4, 'Profession', 5), (6, 'Profession', 5)]
169
+ tail: []
170
+ n: 6, Mark_Hamil
171
+ head: []
172
+ tail: [(6, 'Profession', 5), (6, 'ActedIn', 7), (6, 'BornIn', 8)]
173
+ n: 7, Star_Wars
174
+ head: [(6, 'ActedIn', 7)]
175
+ tail: []
176
+ n: 8, California
177
+ head: [(6, 'BornIn', 8), (9, 'LivedIn', 8), (11, 'LivedIn', 8)]
178
+ tail: []
179
+ n: 9, Brad_Pitt
180
+ head: []
181
+ tail: [(9, 'Nationality', 10), (9, 'BornIn', 10), (9, 'LivedIn', 8)]
182
+ n: 10, USA
183
+ head: [(9, 'Nationality', 10), (9, 'BornIn', 10)]
184
+ tail: []
185
+ n: 11, Clint_Eastwood
186
+ head: []
187
+ tail: [(11, 'BornIn', 12), (11, 'LivedIn', 12), (11, 'LivedIn', 8)]
188
+ n: 12, San_Francisco
189
+ head: [(11, 'BornIn', 12), (11, 'LivedIn', 12)]
190
+ tail: []
191
+
192
+ --- edges in source graph ---
193
+ e: 0, Directed
194
+ e: 1, Profession
195
+ e: 2, ActedIn
196
+ e: 3, LivedIn
197
+ e: 4, BornIn
198
+ e: 5, Nationality
199
+
200
+
201
+ ## construct a _graph of relations_
202
+
203
+ Transform the graph data into _graph of relations_
204
+
205
+
206
+ ```python
207
+ graph.construct_gor(
208
+ debug = True,
209
+ )
210
+ ```
211
+
212
+ ic| node_id: 0, len(seeds
213
+
214
+
215
+ --- transformed triples ---
216
+
217
+
218
+ ): 3
219
+ ic| trans_arc: TransArc(pair_key=(0, 1),
220
+ a_rel=1,
221
+ b_rel=0,
222
+ node_id=0,
223
+ a_dir=<RelDir.TAIL: 1>,
224
+ b_dir=<RelDir.TAIL: 1>)
225
+ ic| trans_arc: TransArc(pair_key=(0, 1),
226
+ a_rel=1,
227
+ b_rel=0,
228
+ node_id=0,
229
+ a_dir=<RelDir.TAIL: 1>,
230
+ b_dir=<RelDir.TAIL: 1>)
231
+ ic| trans_arc: TransArc(pair_key=(0, 0),
232
+ a_rel=0,
233
+ b_rel=0,
234
+ node_id=0,
235
+ a_dir=<RelDir
236
+
237
+ (0, 1) Profession.tail Steven_Spielberg Directed.tail
238
+
239
+ (0, 1) Profession.tail Steven_Spielberg Directed.tail
240
+
241
+ (0, 0) Directed.tail Steven_Spielberg Directed.tail
242
+
243
+
244
+ .TAIL: 1>,
245
+ b_dir=<RelDir.TAIL: 1>)
246
+ ic| node_id: 1, len(seeds
247
+
248
+
249
+
250
+
251
+ ): 1
252
+ ic| node_id: 2, len(seeds): 2
253
+ ic| trans_arc: TransArc(pair_key=(0, 2),
254
+ a_rel=0,
255
+ b_rel=2,
256
+ node_id=2,
257
+ a_dir=<RelDir.HEAD: 0>,
258
+ b_dir=<
259
+
260
+ (0, 2) Directed.head Catch_Me_If_Can ActedIn.head
261
+
262
+
263
+ RelDir.HEAD: 0>)
264
+ ic| node_id: 3, len(seeds): 2
265
+ ic| trans_arc: TransArc(pair_key=(0, 2),
266
+ a_rel=0,
267
+ b_rel=2,
268
+ node_id=3,
269
+ a_dir=<RelDir.HEAD: 0>,
270
+ b_dir=<RelDir.HEAD: 0>)
271
+ ic| node_id
272
+
273
+
274
+ (0, 2) Directed.head Saving_Private_Ryan ActedIn.head
275
+
276
+
277
+
278
+ : 4, len(seeds): 3
279
+ ic| trans_arc: TransArc(pair_key=(2, 2),
280
+ a_rel=2,
281
+ b_rel=2,
282
+ node_id=4,
283
+                             a_dir=<RelDir.TAIL: 1>,
+                             b_dir=<RelDir.TAIL: 1>)
+      (2, 2) ActedIn.tail Tom_Hanks ActedIn.tail
+     ic| trans_arc: TransArc(pair_key=(1, 2),
+                             a_rel=2,
+                             b_rel=1,
+                             node_id=4,
+                             a_dir=<RelDir.TAIL: 1>,
+                             b_dir=<RelDir.TAIL: 1>)
+      (1, 2) ActedIn.tail Tom_Hanks Profession.tail
+     ic| trans_arc: TransArc(pair_key=(1, 2),
+                             a_rel=2,
+                             b_rel=1,
+                             node_id=4,
+                             a_dir=<RelDir.TAIL: 1>,
+                             b_dir=<RelDir.TAIL: 1>)
+      (1, 2) ActedIn.tail Tom_Hanks Profession.tail
+     ic| node_id: 5, len(seeds): 2
+     ic| trans_arc: TransArc(pair_key=(1, 1),
+                             a_rel=1,
+                             b_rel=1,
+                             node_id=5,
+                             a_dir=<RelDir.HEAD: 0>,
+                             b_dir=<RelDir.HEAD: 0>)
+      (1, 1) Profession.head Actor Profession.head
+     ic| node_id: 6, len(seeds): 3
+     ic| trans_arc: TransArc(pair_key=(1, 2),
+                             a_rel=1,
+                             b_rel=2,
+                             node_id=6,
+                             a_dir=<RelDir.TAIL: 1>,
+                             b_dir=<RelDir.TAIL: 1>)
+      (1, 2) Profession.tail Mark_Hamil ActedIn.tail
+     ic| trans_arc: TransArc(pair_key=(1, 4),
+                             a_rel=1,
+                             b_rel=4,
+                             node_id=6,
+                             a_dir=<RelDir.TAIL: 1>,
+                             b_dir=<RelDir.TAIL: 1>)
+      (1, 4) Profession.tail Mark_Hamil BornIn.tail
+     ic| trans_arc: TransArc(pair_key=(2, 4),
+                             a_rel=2,
+                             b_rel=4,
+                             node_id=6,
+                             a_dir=<RelDir.TAIL: 1>,
+                             b_dir=<RelDir.TAIL: 1>)
+      (2, 4) ActedIn.tail Mark_Hamil BornIn.tail
+     ic| node_id: 7, len(seeds): 1
+     ic| node_id: 8, len(seeds): 3
+     ic| trans_arc: TransArc(pair_key=(3, 4),
+                             a_rel=4,
+                             b_rel=3,
+                             node_id=8,
+                             a_dir=<RelDir.HEAD: 0>,
+                             b_dir=<RelDir.HEAD: 0>)
+      (3, 4) BornIn.head California LivedIn.head
+     ic| trans_arc: TransArc(pair_key=(3, 4),
+                             a_rel=4,
+                             b_rel=3,
+                             node_id=8,
+                             a_dir=<RelDir.HEAD: 0>,
+                             b_dir=<RelDir.HEAD: 0>)
+      (3, 4) BornIn.head California LivedIn.head
+     ic| trans_arc: TransArc(pair_key=(3, 3),
+                             a_rel=3,
+                             b_rel=3,
+                             node_id=8,
+                             a_dir=<RelDir.HEAD: 0>,
+                             b_dir=<RelDir.HEAD: 0>)
+      (3, 3) LivedIn.head California LivedIn.head
+     ic| node_id: 9, len(seeds): 3
+     ic| trans_arc: TransArc(pair_key=(4, 5),
+                             a_rel=5,
+                             b_rel=4,
+                             node_id=9,
+                             a_dir=<RelDir.TAIL: 1>,
+                             b_dir=<RelDir.TAIL: 1>)
+      (4, 5) Nationality.tail Brad_Pitt BornIn.tail
+     ic| trans_arc: TransArc(pair_key=(3, 5),
+                             a_rel=5,
+                             b_rel=3,
+                             node_id=9,
+                             a_dir=<RelDir.TAIL: 1>,
+                             b_dir=<RelDir.TAIL: 1>)
+      (3, 5) Nationality.tail Brad_Pitt LivedIn.tail
+     ic| trans_arc: TransArc(pair_key=(3, 4),
+                             a_rel=4,
+                             b_rel=3,
+                             node_id=9,
+                             a_dir=<RelDir.TAIL: 1>,
+                             b_dir=<RelDir.TAIL: 1>)
+      (3, 4) BornIn.tail Brad_Pitt LivedIn.tail
+     ic| node_id: 10, len(seeds): 2
+     ic| trans_arc: TransArc(pair_key=(4, 5),
+                             a_rel=5,
+                             b_rel=4,
+                             node_id=10,
+                             a_dir=<RelDir.HEAD: 0>,
+                             b_dir=<RelDir.HEAD: 0>)
+      (4, 5) Nationality.head USA BornIn.head
+     ic| node_id: 11, len(seeds): 3
+     ic| trans_arc: TransArc(pair_key=(3, 4),
+                             a_rel=4,
+                             b_rel=3,
+                             node_id=11,
+                             a_dir=<RelDir.TAIL: 1>,
+                             b_dir=<RelDir.TAIL: 1>)
+      (3, 4) BornIn.tail Clint_Eastwood LivedIn.tail
+     ic| trans_arc: TransArc(pair_key=(3, 4),
+                             a_rel=4,
+                             b_rel=3,
+                             node_id=11,
+                             a_dir=<RelDir.TAIL: 1>,
+                             b_dir=<RelDir.TAIL: 1>)
+      (3, 4) BornIn.tail Clint_Eastwood LivedIn.tail
+     ic| trans_arc: TransArc(pair_key=(3, 3),
+                             a_rel=3,
+                             b_rel=3,
+                             node_id=11,
+                             a_dir=<RelDir.TAIL: 1>,
+                             b_dir=<RelDir.TAIL: 1>)
+      (3, 3) LivedIn.tail Clint_Eastwood LivedIn.tail
+     ic| node_id: 12, len(seeds): 2
+     ic| trans_arc: TransArc(pair_key=(3, 4),
+                             a_rel=4,
+                             b_rel=3,
+                             node_id=12,
+                             a_dir=<RelDir.HEAD: 0>,
+                             b_dir=<RelDir.HEAD: 0>)
+      (3, 4) BornIn.head San_Francisco LivedIn.head
+ 
+ 
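+ Each `trans_arc` record above pairs two relations which meet at a shared entity node, one record per incident pair of arcs. As a reading aid, here is a minimal sketch of the shape of these records; the field names and `RelDir` values are taken from the trace itself, while the actual definitions live in `textgraphs/gor.py` and may differ:
+ 
+ ```python
+ import enum
+ import typing
+ from dataclasses import dataclass
+ 
+ 
+ class RelDir (enum.IntEnum):
+     """Direction of a relation incident at a shared node."""
+     HEAD = 0
+     TAIL = 1
+ 
+ 
+ @dataclass
+ class TransArc:
+     """Hypothetical mirror of the fields dumped in the trace above."""
+     pair_key: typing.Tuple[ int, int ]  # sorted pair of relation ids
+     a_rel: int  # relation id on one side of the pair
+     b_rel: int  # relation id on the other side
+     node_id: int  # shared entity node
+     a_dir: RelDir  # direction of `a_rel` at the node
+     b_dir: RelDir  # direction of `b_rel` at the node
+ ```
+ 
+ For example, node 6 (`Mark_Hamil`) sits at the tail of `Profession`, `ActedIn`, and `BornIn` arcs, which yields the three pairs `(1, 2)`, `(1, 4)`, `(2, 4)` traced above.
+ 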
+ ```python
+ scores: typing.Dict[ tuple, float ] = graph.get_affinity_scores(
+     debug = True,
+ )
+ ```
+ 
+ 
+     --- collect shared entity tallies ---
+     0 Directed
+      h: 4 dict_items([(2, 4.0)])
+      t: 6 dict_items([(0, 3.0), (1, 3.0)])
+     1 Profession
+      h: 3 dict_items([(1, 3.0)])
+      t: 10 dict_items([(0, 3.0), (2, 5.0), (4, 2.0)])
+     2 ActedIn
+      h: 4 dict_items([(0, 4.0)])
+      t: 10 dict_items([(1, 5.0), (2, 3.0), (4, 2.0)])
+     3 LivedIn
+      h: 8 dict_items([(3, 3.0), (4, 5.0)])
+      t: 10 dict_items([(3, 3.0), (4, 5.0), (5, 2.0)])
+     4 BornIn
+      h: 7 dict_items([(3, 5.0), (5, 2.0)])
+      t: 11 dict_items([(1, 2.0), (2, 2.0), (3, 5.0), (5, 2.0)])
+     5 Nationality
+      h: 2 dict_items([(4, 2.0)])
+      t: 4 dict_items([(3, 2.0), (4, 2.0)])
+ 
+ 
+ ```python
+ ic(scores);
+ ```
+ 
+     ic| scores: {(0, 0): 0.3,
+                  (0, 1): 0.2653846153846154,
+                  (0, 2): 0.34285714285714286,
+                  (1, 1): 0.23076923076923078,
+                  (1, 2): 0.3708791208791209,
+                  (1, 4): 0.13247863247863248,
+                  (2, 2): 0.21428571428571427,
+                  (2, 4): 0.12698412698412698,
+                  (3, 3): 0.3333333333333333,
+                  (3, 4): 0.5555555555555556,
+                  (3, 5): 0.2222222222222222,
+                  (4, 5): 0.4444444444444444}
+ 
+ 
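+ Since `scores` is keyed on sorted pairs of relation ids, one convenient way to inspect it is as a symmetric matrix. A minimal sketch, assuming the relation ids `0` through `5` shown in the tallies above, with unlisted pairs left at zero:
+ 
+ ```python
+ import numpy as np
+ 
+ n_rel: int = 6  # relation ids 0..5, per the tallies above
+ aff: np.ndarray = np.zeros((n_rel, n_rel))
+ 
+ for (rel_a, rel_b), score in scores.items():
+     # `scores` holds each sorted pair once, so mirror into both triangles
+     aff[rel_a, rel_b] = score
+     aff[rel_b, rel_a] = score
+ 
+ print(aff.round(2))
+ ```
+ 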
+ ## visualize the transform results
+ 
+ 
+ ```python
+ graph.render_gor_plt(scores)
+ plt.show()
+ ```
+ 
+ 
+ ![png](ex1_0_files/ex1_0_22_0.png)
+ 
+ 
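+ To keep a file copy of this rendering rather than only displaying it inline, a small variation on the cell above works; the filename and DPI here are arbitrary choices:
+ 
+ ```python
+ graph.render_gor_plt(scores)
+ plt.savefig("gor_affinity.png", dpi = 300, bbox_inches = "tight")  # hypothetical output path
+ plt.close()
+ ```
+ 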
+ ```python
+ pv_graph: pyvis.network.Network = graph.render_gor_pyvis(scores)
+ 
+ pv_graph.force_atlas_2based(
+     gravity = -38,
+     central_gravity = 0.01,
+     spring_length = 231,
+     spring_strength = 0.7,
+     damping = 0.8,
+     overlap = 0,
+ )
+ 
+ pv_graph.show_buttons(filter_ = [ "physics" ])
+ pv_graph.toggle_physics(True)
+ 
+ pv_graph.prep_notebook()
+ pv_graph.show("tmp.fig03.html")
+ ```
+ 
+     tmp.fig03.html
+ 
+ 
+ ![png](ex1_0_files/tmp.fig03.png)
+ 
+ 
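+ Since `render_gor_pyvis` hands back a plain `pyvis` network, the usual `pyvis` affordances all apply. For orientation, a standalone sketch of the same pattern, independent of `textgraphs` (node ids, labels, and the edge weight here are illustrative only):
+ 
+ ```python
+ import pyvis.network
+ 
+ # build a tiny interactive graph by hand, mirroring how the renderer works
+ net = pyvis.network.Network(height = "500px", width = "100%", notebook = True)
+ net.add_node(3, label = "LivedIn")
+ net.add_node(4, label = "BornIn")
+ net.add_edge(3, 4, value = 0.56)  # edge weight drawn from an affinity score
+ net.show("tmp.example.html")  # hypothetical output path
+ ```
+ 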
+ ## analysis
+ 
+ As the results above illustrate, the computed _affinity scores_ differ from those published in `lee2023ingram`. After trying several different interpretations of the paper's descriptions, the current approach provides the closest approximation we have obtained.
+ 
+ 
+ ```python
+ df: pd.DataFrame = graph.trace_metrics(scores)
+ df
+ ```
+ 
+ 
+ 
+ |    | pair   | rel_a      | rel_b       | affinity | expected |
+ |----|--------|------------|-------------|----------|----------|
+ | 0  | (0, 0) | Directed   | Directed    | 0.30     | NaN      |
+ | 1  | (0, 1) | Directed   | Profession  | 0.27     | 0.22     |
+ | 2  | (0, 2) | Directed   | ActedIn     | 0.34     | 0.50     |
+ | 3  | (1, 1) | Profession | Profession  | 0.23     | NaN      |
+ | 4  | (1, 2) | Profession | ActedIn     | 0.37     | 0.33     |
+ | 5  | (1, 4) | Profession | BornIn      | 0.13     | 0.11     |
+ | 6  | (2, 2) | ActedIn    | ActedIn     | 0.21     | NaN      |
+ | 7  | (2, 4) | ActedIn    | BornIn      | 0.13     | 0.11     |
+ | 8  | (3, 3) | LivedIn    | LivedIn     | 0.33     | NaN      |
+ | 9  | (3, 4) | LivedIn    | BornIn      | 0.56     | 0.81     |
+ | 10 | (3, 5) | LivedIn    | Nationality | 0.22     | 0.11     |
+ | 11 | (4, 5) | BornIn     | Nationality | 0.44     | 0.36     |
+ 
+ 
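+ To put a number on how far these scores sit from the published values, the two columns can be compared directly, skipping the `NaN` rows which have no published counterpart. A minimal sketch over the dataframe above:
+ 
+ ```python
+ # keep only the pairs which have a published value to compare against
+ cmp_df: pd.DataFrame = df.dropna(subset = [ "expected" ])
+ 
+ mae: float = (cmp_df["affinity"] - cmp_df["expected"]).abs().mean()
+ print(f"mean absolute error vs. `lee2023ingram`: {mae:.3f}")
+ ```
+ 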
+ ## statistical stack profile instrumentation
+ 
+ 
+ ```python
+ profiler.stop()
+ ```
+ 
+ 
+ 
+     <pyinstrument.session.Session at 0x1416bc7f0>
+ 
+ 
+ 
+ ```python
+ profiler.print()
+ ```
+ 
+ 
+       _     ._   __/__   _ _  _  _ _/_   Recorded: 17:35:45  Samples:  2526
+      /_//_/// /_\ / //_// / //_'/ //     Duration: 3.799     CPU time: 4.060
+     /   _/                      v4.6.1
+ 
+     Program: /Users/paco/src/textgraphs/venv/lib/python3.10/site-packages/ipykernel_launcher.py -f /Users/paco/Library/Jupyter/runtime/kernel-27f0c564-73f8-45ab-9f64-8b064ae1de10.json
+ 
+     3.799 IPythonKernel.dispatch_queue  ipykernel/kernelbase.py:525
+     └─ 3.791 IPythonKernel.process_one  ipykernel/kernelbase.py:511
+           [10 frames hidden]  ipykernel, IPython
+              3.680 ZMQInteractiveShell.run_ast_nodes  IPython/core/interactiveshell.py:3394
+              ├─ 2.176 <module>  ../ipykernel_4421/3358887201.py:1
+              │  └─ 2.176 GraphOfRelations.construct_gor  textgraphs/gor.py:311
+              │     ├─ 1.607 IceCreamDebugger.__call__  icecream/icecream.py:204
+              │     │     [17 frames hidden]  icecream, colorama, ipykernel, thread...
+              │     │        1.078 lock.acquire  <built-in>
+              │     └─ 0.566 GraphOfRelations._transformed_triples  textgraphs/gor.py:275
+              │        └─ 0.563 IceCreamDebugger.__call__  icecream/icecream.py:204
+              │              [13 frames hidden]  icecream, colorama, ipykernel, zmq, t...
+              ├─ 0.866 <module>  ../ipykernel_4421/4061275008.py:1
+              │  └─ 0.866 GraphOfRelations.seeds  textgraphs/gor.py:197
+              │     └─ 0.865 IceCreamDebugger.__call__  icecream/icecream.py:204
+              │           [42 frames hidden]  icecream, inspect, posixpath, <built-...
+              ├─ 0.362 <module>  ../ipykernel_4421/559531165.py:1
+              │  ├─ 0.234 show  matplotlib/pyplot.py:482
+              │  │     [32 frames hidden]  matplotlib, matplotlib_inline, IPytho...
+              │  └─ 0.128 GraphOfRelations.render_gor_plt  textgraphs/gor.py:522
+              │     └─ 0.104 draw_networkx  networkx/drawing/nx_pylab.py:127
+              │           [6 frames hidden]  networkx, matplotlib
+              ├─ 0.197 <module>  ../ipykernel_4421/1169542473.py:1
+              │  └─ 0.197 IceCreamDebugger.__call__  icecream/icecream.py:204
+              │        [14 frames hidden]  icecream, colorama, ipykernel, thread...
+              └─ 0.041 <module>  ../ipykernel_4421/2247466716.py:1
+ 
+ 
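+ For reference, the profiler used throughout this notebook is `pyinstrument`, a statistical (sampling) profiler, started in an earlier cell. The general usage pattern, sketched standalone here with a throwaway workload:
+ 
+ ```python
+ from pyinstrument import Profiler
+ 
+ profiler = Profiler()
+ profiler.start()
+ 
+ sum(i * i for i in range(1_000_000))  # stand-in for the code under measurement
+ 
+ profiler.stop()
+ profiler.print()  # render the sampled call tree, as shown above
+ ```
+ 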
+ ## outro
+ 
+ _\[ more parts are in progress, getting added to this demo \]_
docs/ex1_0_files/ex1_0_22_0.png ADDED