# sparrow-ml/rag/agents/instructor/helpers/instructor_helper.py
# Page metadata: katanaml · commit "Sparrow Parse" · 42cd5f6
from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
import json
def execute_sparrow_processor(options, file_path, strategy, model_name, local, debug):
    """Run the Sparrow Parse extractor selected by *options* on a file.

    "unstructured" is checked first and takes precedence over "markdown"
    when both are present in *options*.

    Args:
        options: Container (or string) tested with ``in`` for the markers
            "unstructured" and "markdown".
        file_path: Forwarded unchanged to the processor's ``extract_data``.
        strategy: Forwarded to the unstructured processor only.
        model_name: Forwarded to the unstructured processor only.
        local: Forwarded unchanged to the processor.
        debug: Forwarded unchanged to the processor.

    Returns:
        A ``(content, table_content)`` tuple as produced by the processor,
        or ``(None, None)`` when neither marker is found in *options*.
    """
    if "unstructured" in options:
        text, tables = UnstructuredProcessor().extract_data(
            file_path,
            strategy,
            model_name,
            ['tables', 'unstructured'],
            local,
            debug,
        )
        return text, tables

    if "markdown" in options:
        text, tables = MarkdownProcessor().extract_data(
            file_path,
            ['tables', 'markdown'],
            local,
            debug,
        )
        return text, tables

    # No recognised processor requested: nothing was extracted.
    return None, None
def merge_dicts(json_str1, json_str2):
    """Merge two JSON documents into a single dictionary.

    Both arguments are parsed with ``json.loads``. Keys from the second
    document are applied on top of the first: when a key exists in both
    and both values are lists, the second list is appended to the first;
    in every other case the value from the second document wins.

    Args:
        json_str1: JSON text providing the base entries.
        json_str2: JSON text whose entries are merged into the base.

    Returns:
        The merged dictionary (a parsed object, not a JSON string).
    """
    base = json.loads(json_str1)
    incoming = json.loads(json_str2)

    combined = base.copy()
    for name, new_value in incoming.items():
        both_lists = (
            name in combined
            and isinstance(combined[name], list)
            and isinstance(new_value, list)
        )
        if both_lists:
            # Concatenate list values instead of replacing them.
            combined[name].extend(new_value)
            continue
        combined[name] = new_value

    return combined
def _has_no_value(value):
    """Return True when a parsed JSON value should count as empty."""
    if value is None:
        return True
    if isinstance(value, str):
        # Whitespace-only strings carry no answer.
        return not value.strip()
    if isinstance(value, (list, dict)):
        return not value
    # Numbers and booleans (including 0 and False) are real values.
    return False


def track_query_output(keys, json_data, types):
    """Report which expected keys are missing or empty in a JSON document.

    A key counts as unanswered when it is absent from the parsed JSON, or
    when its value is ``null``, a blank/whitespace-only string, or an empty
    array/object. Numeric and boolean values always count as answered.
    (Previously any non-string value raised ``AttributeError`` because
    ``.strip()`` was called on it unconditionally.)

    Args:
        keys: Sequence of key names expected in the JSON document.
        json_data: JSON text that parses to an object.
        types: Sequence parallel to *keys*; ``types[i]`` describes
            ``keys[i]``.

    Returns:
        A ``(missing_keys, missing_types)`` tuple of two lists holding the
        unanswered keys and their corresponding entries from *types*, in
        the order they appear in *keys*.

    Raises:
        json.JSONDecodeError: If *json_data* is not valid JSON.
        IndexError: If an unanswered key has no matching entry in *types*.
    """
    data = json.loads(json_data)

    missing_keys = []
    missing_types = []
    for index, key in enumerate(keys):
        if key not in data or _has_no_value(data[key]):
            missing_keys.append(key)
            missing_types.append(types[index])

    return missing_keys, missing_types
def add_answer_page(answer, page_name, answer_page):
    """Store one page's answer under *page_name* in the *answer* dictionary.

    The *answer* dictionary is modified in place and also returned.

    Args:
        answer: Dictionary that collects the per-page answers.
        page_name: Key under which the page's answer is stored.
        answer_page: The page's answer; a string is parsed as JSON first,
            any other value is stored as-is.

    Returns:
        The same *answer* dictionary, now containing the new entry.

    Raises:
        ValueError: If *answer* is not a dictionary.
    """
    if isinstance(answer, dict):
        # Decode JSON text so the stored value is always a parsed object.
        page_payload = json.loads(answer_page) if isinstance(answer_page, str) else answer_page
        answer[page_name] = page_payload
        return answer

    raise ValueError("The answer should be a dictionary.")