Spaces:

GotoUsuke
/

GraphRag

Running

App Files Files Community

GraphRag / graphrag-ollama /lib /python3.12 /site-packages /_plotly_utils /utils.py

GotoUsuke

Upload folder using huggingface_hub

db4a26f verified 7 months ago

raw

history blame contribute delete

14.3 kB

	import decimal
	import json as _json
	import sys
	import re
	from functools import reduce

	from _plotly_utils.optional_imports import get_module
	from _plotly_utils.basevalidators import ImageUriValidator


	def cumsum(x):
	    """
	    Custom cumsum to avoid a numpy import.

	    Return a list whose k-th entry is the sum of the first k+1 items of `x`.
	    The items only need to support `+` with each other; an empty iterable
	    yields an empty list.

	    :param (iterable) x: The values to accumulate.
	    :return (list): The running totals, same length as `x`.
	    """
	    totals = []
	    for value in x:
	        # The first item seeds the running total; afterwards each item is
	        # added to the previous total. Appending in place keeps this linear
	        # instead of copying the list on every step.
	        totals.append(totals[-1] + value if totals else value)
	    return totals


	class PlotlyJSONEncoder(_json.JSONEncoder):
	    """
	    Meant to be passed as the `cls` kwarg to json.dumps(obj, cls=..)

	    See PlotlyJSONEncoder.default for more implementation information.

	    Additionally, this encoder overrides nan functionality so that 'Inf',
	    'NaN' and '-Inf' encode to 'null'. Which is stricter JSON than the Python
	    version.

	    """

	    def coerce_to_strict(self, const):
	        """
	        Map the extended-JSON constants to None; used as the
	        `parse_constant` hook in `encode`.

	        :param (str) const: The constant name handed over by `json.loads`.
	        :return: None for 'Infinity', '-Infinity' and 'NaN', otherwise
	            `const` unchanged.
	        """
	        # before python 2.7, 'true', 'false', 'null', were include here.
	        if const in ("Infinity", "-Infinity", "NaN"):
	            return None
	        return const

	    def encode(self, o):
	        """
	        Encode `o`, then load and re-dump the result using the
	        `parse_constant` kwarg so that NaN/Infinity become 'null'.

	        Note that setting invalid separators will cause a failure at this step.

	        :param o: The object to serialize.
	        :return (str): The strict-JSON text.
	        :raises ValueError: If the first-pass output cannot be parsed back.
	        """
	        # this will raise errors in a normal-expected way
	        encoded_o = super().encode(o)

	        # Brute force guessing whether NaN or Infinity values are in the string
	        # We catch false positive cases (e.g. strings such as titles, labels etc.)
	        # but this is ok since the intention is to skip the decoding / reencoding
	        # step when it's completely safe
	        if not ("NaN" in encoded_o or "Infinity" in encoded_o):
	            return encoded_o

	        # now:
	        #    1. `loads` to switch Infinity, -Infinity, NaN to None
	        #    2. `dumps` again so you get 'null' instead of extended JSON
	        try:
	            new_o = _json.loads(encoded_o, parse_constant=self.coerce_to_strict)
	        except ValueError as err:
	            # invalid separators will fail here. raise a helpful exception
	            raise ValueError(
	                "Encoding into strict JSON failed. Did you set the separators "
	                "valid JSON separators?"
	            ) from err

	        # Forward the encoder's own formatting options so the second pass
	        # produces the same style as the first one. `ensure_ascii` must be
	        # forwarded too, otherwise non-ASCII text would be escaped here even
	        # when the encoder was created with ensure_ascii=False.
	        return _json.dumps(
	            new_o,
	            sort_keys=self.sort_keys,
	            indent=self.indent,
	            separators=(self.item_separator, self.key_separator),
	            ensure_ascii=self.ensure_ascii,
	        )

	    def default(self, obj):
	        """
	        Accept an object (of unknown type) and try to encode it with the
	        following helpers, in this order:
	            1. encode_as_plotly: objects with a `to_plotly_json` method
	            2. encode_as_sage: sage math values
	            3. encode_as_numpy: masked constant / datetime64 arrays
	            4. encode_as_pandas: NaT / NA
	            5. encode_as_datetime: objects with `isoformat`
	            6. encode_as_date: objects with `isoformat`
	            7. encode_as_list: objects with `tolist`
	            8. encode_as_decimal: decimal.Decimal
	            9. encode_as_pil: PIL images

	        Each helper raises NotEncodable if it cannot handle the object. If
	        all of them do, the base `JSONEncoder.default` is called.

	        The default method will only get hit if the object is not a type that
	        is naturally encoded by json:

	            Normal objects:
	                dict                object
	                list, tuple         array
	                str, unicode        string
	                int, long, float    number
	                True                true
	                False               false
	                None                null

	            Extended objects:
	                float('nan')        'NaN'
	                float('infinity')   'Infinity'
	                float('-infinity')  '-Infinity'

	        Therefore, we only anticipate either unknown iterables or values here.

	        """
	        # TODO: The ordering of these methods is very important. Is this OK?
	        # NOTE: encode_as_datetime and encode_as_date both start with
	        # obj.isoformat(); because the datetime variant is tried first, the
	        # date variant is only reached when that call raised AttributeError.
	        encoding_methods = (
	            self.encode_as_plotly,
	            self.encode_as_sage,
	            self.encode_as_numpy,
	            self.encode_as_pandas,
	            self.encode_as_datetime,
	            self.encode_as_date,
	            self.encode_as_list,  # kept late because some values have `tolist`
	            self.encode_as_decimal,
	            self.encode_as_pil,
	        )
	        for encoding_method in encoding_methods:
	            try:
	                return encoding_method(obj)
	            except NotEncodable:
	                pass
	        return _json.JSONEncoder.default(self, obj)

	    @staticmethod
	    def encode_as_plotly(obj):
	        """Attempt to use a builtin `to_plotly_json` method.

	        Raises NotEncodable when the call raises AttributeError.
	        """
	        try:
	            return obj.to_plotly_json()
	        except AttributeError:
	            raise NotEncodable

	    @staticmethod
	    def encode_as_list(obj):
	        """Attempt to use `tolist` method to convert to normal Python list.

	        Raises NotEncodable when `obj` has no `tolist` attribute.
	        """
	        if hasattr(obj, "tolist"):
	            return obj.tolist()
	        raise NotEncodable

	    @staticmethod
	    def encode_as_sage(obj):
	        """Attempt to convert sage.all.RR to floats and sage.all.ZZ to ints.

	        Raises NotEncodable when sage is unavailable or `obj` is in neither
	        set.
	        """
	        sage_all = get_module("sage.all")
	        if not sage_all:
	            raise NotEncodable

	        if obj in sage_all.RR:
	            return float(obj)
	        if obj in sage_all.ZZ:
	            return int(obj)
	        raise NotEncodable

	    @staticmethod
	    def encode_as_pandas(obj):
	        """Attempt to convert pandas.NaT / pandas.NA to None.

	        Raises NotEncodable when pandas is unavailable or `obj` is neither
	        of those singletons.
	        """
	        pandas = get_module("pandas", should_load=False)
	        if not pandas:
	            raise NotEncodable

	        if obj is pandas.NaT:
	            return None

	        # pandas.NA was introduced in pandas 1.0
	        if hasattr(pandas, "NA") and obj is pandas.NA:
	            return None

	        raise NotEncodable

	    @staticmethod
	    def encode_as_numpy(obj):
	        """Attempt to convert numpy.ma.core.masked and datetime64 arrays.

	        The masked constant becomes float('nan'); an ndarray whose dtype kind
	        is "M" is converted with numpy.datetime_as_string(...).tolist().
	        Raises NotEncodable in every other case, including when that
	        conversion raises TypeError.
	        """
	        numpy = get_module("numpy", should_load=False)
	        if not numpy:
	            raise NotEncodable

	        if obj is numpy.ma.core.masked:
	            return float("nan")
	        if isinstance(obj, numpy.ndarray) and obj.dtype.kind == "M":
	            try:
	                return numpy.datetime_as_string(obj).tolist()
	            except TypeError:
	                # fall through to NotEncodable so later helpers get a chance
	                pass

	        raise NotEncodable

	    @staticmethod
	    def encode_as_datetime(obj):
	        """Convert datetime objects to iso-format strings.

	        Raises NotEncodable when `obj.isoformat()` raises AttributeError.
	        """
	        try:
	            return obj.isoformat()
	        except AttributeError:
	            raise NotEncodable

	    @staticmethod
	    def encode_as_date(obj):
	        """Attempt to convert to utc-iso time string using date methods.

	        The result of `obj.isoformat()` is passed through
	        `iso_to_plotly_time_string`. Raises NotEncodable when
	        `obj.isoformat()` raises AttributeError.
	        """
	        try:
	            time_string = obj.isoformat()
	        except AttributeError:
	            raise NotEncodable
	        return iso_to_plotly_time_string(time_string)

	    @staticmethod
	    def encode_as_decimal(obj):
	        """Attempt to encode decimal by converting it to float.

	        Raises NotEncodable when `obj` is not a decimal.Decimal.
	        """
	        if isinstance(obj, decimal.Decimal):
	            return float(obj)
	        raise NotEncodable

	    @staticmethod
	    def encode_as_pil(obj):
	        """Attempt to convert PIL.Image.Image to base64 data uri.

	        Raises NotEncodable when PIL.Image is unavailable or `obj` is not an
	        instance of its `Image` class.
	        """
	        image = get_module("PIL.Image")
	        if image is not None and isinstance(obj, image.Image):
	            return ImageUriValidator.pil_image_to_uri(obj)
	        raise NotEncodable


	class NotEncodable(Exception):
	    """Raised by an `encode_as_*` helper that cannot handle the given object."""


	def iso_to_plotly_time_string(iso_string):
	    """Strip zero UTC offsets and replace the 'T' delimiter with ' ' (ws).

	    Every "-00:00" and "+00:00" is removed from the string. If the result
	    ends in "T00:00:00" (midnight), that part is dropped so only the date
	    remains; otherwise each "T" is replaced with a space. Non-zero offsets
	    are left in place.

	    :param (str) iso_string: An ISO-format time string.
	    :return (str): The reformatted time string.
	    :raises Exception: If the text before the first "+" is exactly "00:00".
	    """
	    # make sure we don't send timezone info to plotly
	    # NOTE(review): this guard used to also compare
	    # `iso_string.split("-")[:3]` (a list) with "00:00" (a str), which can
	    # never be equal, so that half was dropped. The remaining comparison only
	    # fires for strings that start with "00:00"; whether a broader timezone
	    # check was intended should be confirmed before tightening it.
	    if iso_string.split("+")[0] == "00:00":
	        raise Exception(
	            "Plotly won't accept timestrings with timezone info.\n"
	            "All timestrings are assumed to be in UTC."
	        )

	    iso_string = iso_string.replace("-00:00", "").replace("+00:00", "")

	    if iso_string.endswith("T00:00:00"):
	        return iso_string.replace("T00:00:00", "")
	    return iso_string.replace("T", " ")


	def template_doc(**names):
	    """
	    Build a decorator that fills `{placeholder}` fields in a docstring.

	    The decorated function's `__doc__` is replaced by
	    `__doc__.format(**names)`. Functions without a docstring are returned
	    untouched, and nothing is formatted when running on Python 3.2.

	    :param names: The values substituted into the docstring.
	    :return (callable): A decorator returning the same function object.
	    """

	    def _decorator(func):
	        on_python_32 = sys.version_info[:2] == (3, 2)
	        has_doc = func.__doc__ is not None
	        if has_doc and not on_python_32:
	            func.__doc__ = func.__doc__.format(**names)
	        return func

	    return _decorator


	def _natural_sort_strings(vals, reverse=False):
	    """
	    Sort strings so that embedded runs of digits compare as integers.

	    Each string is split around its digit runs; every piece that `int`
	    accepts is converted, and the resulting tuple is used as the sort key.

	    :param (iterable) vals: The strings to sort.
	    :param (bool) reverse: Passed through to `sorted`.
	    :return (list): A new, sorted list.
	    """

	    def _int_if_possible(piece):
	        try:
	            return int(piece)
	        except ValueError:
	            # not an int
	            return piece

	    def _natural_key(text):
	        return tuple(_int_if_possible(piece) for piece in re.split(r"(\d+)", text))

	    return sorted(vals, key=_natural_key, reverse=reverse)


	def _get_int_type():
	    """
	    Return the tuple of types to treat as integers.

	    The tuple is `(int, numpy.integer)` when `get_module("numpy",
	    should_load=False)` yields a module, and `(int,)` otherwise.

	    :return (tuple): Types suitable for `isinstance` checks.
	    """
	    numpy_module = get_module("numpy", should_load=False)
	    return (int, numpy_module.integer) if numpy_module else (int,)


	def split_multichar(ss, chars):
	    """
	    Split all the strings in ss at any of the characters in chars.
	    Example:

	        >>> ss = ["a.string[0].with_separators"]
	        >>> chars = list(".[]_")
	        >>> split_multichar(ss, chars)
	        ['a', 'string', '0', '', 'with', 'separators']

	    `chars` is not modified. When `chars` is empty, `ss` itself is returned;
	    when `ss` is empty, the result is an empty list.

	    :param (list) ss: A list of strings.
	    :param (list) chars: Is a list of chars (note: not a string).
	    :return (list): The pieces, in their original left-to-right order.
	    """
	    # Work on a copy so the caller's list is left intact, and go from the last
	    # separator to the first, the order in which they used to be popped.
	    for separator in reversed(list(chars)):
	        ss = [piece for s in ss for piece in s.split(separator)]
	    return ss


	def split_string_positions(ss):
	    """
	    Given a list of strings split using split_multichar, return a list of
	    integers representing the indices of the first character of every string in
	    the original string.
	    Example:

	        >>> ss = ["a.string[0].with_separators"]
	        >>> chars = list(".[]_")
	        >>> ss_split = split_multichar(ss, chars)
	        >>> ss_split
	        ['a', 'string', '0', '', 'with', 'separators']
	        >>> split_string_positions(ss_split)
	        [0, 2, 9, 11, 12, 17]

	    :param (list) ss: A list of strings.
	    :return (list): One start index per string in `ss`.
	    """
	    positions = []
	    chars_before = 0
	    for index, piece in enumerate(ss):
	        # Each earlier piece is followed by one separator character, so the
	        # start index is the length of all earlier pieces plus `index`.
	        positions.append(chars_before + index)
	        chars_before += len(piece)
	    return positions


	def display_string_positions(p, i=None, offset=0, length=1, char="^", trim=True):
	    """
	    Return a string that is whitespace except at p[i] which is replaced with char.
	    If i is None then all the indices of the string in p are replaced with char.

	    Example:

	        >>> ss = ["a.string[0].with_separators"]
	        >>> chars = list(".[]_")
	        >>> ss_split = split_multichar(ss, chars)
	        >>> ss_split
	        ['a', 'string', '0', '', 'with', 'separators']
	        >>> ss_pos = split_string_positions(ss_split)
	        >>> ss[0]
	        'a.string[0].with_separators'
	        >>> display_string_positions(ss_pos,4)
	        '            ^'
	        >>> display_string_positions(ss_pos,4,offset=1,length=3,char="~",trim=False)
	        '             ~~~      '
	        >>> display_string_positions(ss_pos)
	        '^ ^      ^ ^^    ^'

	    :param (list) p: A list of integers.
	    :param (integer|None) i: Optional index of p to display.
	    :param (integer) offset: Allows adding a number of spaces to the replacement.
	    :param (integer) length: Allows adding a replacement that is the char
	        repeated length times.
	    :param (str) char: allows customizing the replacement character.
	    :param (boolean) trim: trims the remaining whitespace if True.
	    :return (str): The marker string.
	    """
	    canvas = [" "] * (max(p) + 1 + offset + length)
	    starts = p if i is None else [p[i]]
	    rightmost = 0
	    for start in starts:
	        for step in range(length):
	            address = start + offset + step
	            canvas[address] = char
	            # Track the rightmost marked cell rather than the last one
	            # written, so trimming never cuts markers when p is unsorted.
	            rightmost = max(rightmost, address)
	    ret = "".join(canvas)
	    if trim:
	        ret = ret[: rightmost + 1]
	    return ret


	def chomp_empty_strings(strings, c, reverse=False):
	    """
	    Given a list of strings, some of which are the empty string "", replace the
	    empty strings with c and combine them with the closest non-empty string on
	    the left or "" if it is the first string.
	    Examples:
	    for c="_"
	    ['hey', '', 'why', '', '', 'whoa', '', ''] -> ['hey_', 'why__', 'whoa__']
	    ['', 'hi', '', "I'm", 'bob', '', ''] -> ['_', 'hi_', "I'm", 'bob__']
	    ['hi', "i'm", 'a', 'good', 'string'] -> ['hi', "i'm", 'a', 'good', 'string']
	    Some special cases are:
	    [] -> []
	    [''] -> ['']
	    ['', ''] -> ['_']
	    ['', '', '', ''] -> ['___']
	    If reverse is true, empty strings are combined with closest non-empty string
	    on the right or "" if it is the last string.

	    :param (list) strings: The strings to merge.
	    :param (str) c: The text that stands in for each empty string.
	    :param (bool) reverse: Attach to the right-hand neighbour instead.
	    :return (list): The merged strings.
	    """

	    def _mirror(items):
	        # reverse the order of the items and the characters inside each item
	        return [item[::-1] for item in items][::-1]

	    if reverse:
	        # Solve the mirrored problem left-to-right, then mirror the answer back.
	        return _mirror(chomp_empty_strings(_mirror(strings), c))
	    if not len(strings):
	        return strings
	    if sum(map(len, strings)) == 0:
	        # only empty strings: a single run that is one separator short
	        return [c * (len(strings) - 1)]

	    # Start from a leading "" so that empty strings at the front have something
	    # to attach to; it is filtered out at the end if it stays empty.
	    merged = [""]
	    for item in strings:
	        if len(item) == 0:
	            merged[-1] = merged[-1] + c
	        else:
	            merged.append(item)
	    return [item for item in merged if len(item)]


	# taken from
	# https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
	def levenshtein(s1, s2):
	    """
	    Return the Levenshtein (edit) distance between two sequences.

	    :param s1: The first sequence, typically a string.
	    :param s2: The second sequence, typically a string.
	    :return (int): The number of single-item insertions, deletions and
	        substitutions needed to turn one sequence into the other.
	    """
	    # Make s1 the longer of the two so each row is as short as possible.
	    if len(s1) < len(s2):
	        s1, s2 = s2, s1
	    if len(s2) == 0:
	        return len(s1)

	    # `above` is the previous row of the distance table; rows are one entry
	    # longer than s2, hence the `column + 1` lookups below.
	    above = list(range(len(s2) + 1))
	    for row_number, item1 in enumerate(s1, start=1):
	        row = [row_number]
	        for column, item2 in enumerate(s2):
	            insert_cost = above[column + 1] + 1
	            delete_cost = row[column] + 1
	            substitute_cost = above[column] + (item1 != item2)
	            row.append(min(insert_cost, delete_cost, substitute_cost))
	        above = row
	    return above[-1]


	def find_closest_string(string, strings):
	    """
	    Return the entry of `strings` with the smallest Levenshtein distance to
	    `string`.

	    :param (str) string: The string to match.
	    :param (iterable) strings: The candidates.
	    :return (str): The closest candidate.
	    """
	    # Rank by distance first and by the candidate itself second, so that
	    # candidates at the same distance always come out in the same order.
	    ranked = sorted(
	        strings,
	        key=lambda candidate: (levenshtein(candidate, string), candidate),
	    )
	    return ranked[0]