Spaces:
Sleeping
Sleeping
from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor | |
from sparrow_parse.extractor.markdown_processor import MarkdownProcessor | |
import json | |
def execute_sparrow_processor(options, file_path, strategy, model_name, local, debug):
    """Run the extractor selected by *options* and return its two outputs.

    "unstructured" is checked first and takes precedence over "markdown"
    when both are present in *options*.

    Args:
        options: Container (or string) tested with ``in`` for the markers
            "unstructured" and "markdown".
        file_path: Forwarded unchanged to the chosen processor.
        strategy: Forwarded to the unstructured processor only.
        model_name: Forwarded to the unstructured processor only.
        local: Forwarded unchanged to the chosen processor.
        debug: Forwarded unchanged to the chosen processor.

    Returns:
        A ``(content, table_content)`` tuple taken from the processor's
        ``extract_data`` result, or ``(None, None)`` when neither marker is
        found in *options*.
    """
    if "unstructured" in options:
        extractor = UnstructuredProcessor()
        content, table_content = extractor.extract_data(
            file_path,
            strategy,
            model_name,
            ['tables', 'unstructured'],
            local,
            debug,
        )
        return content, table_content

    if "markdown" in options:
        extractor = MarkdownProcessor()
        content, table_content = extractor.extract_data(
            file_path,
            ['tables', 'markdown'],
            local,
            debug,
        )
        return content, table_content

    # No recognised processor requested.
    return None, None
def merge_dicts(json_str1, json_str2):
    """Decode two JSON documents and merge the second into the first.

    For a key present in both documents whose values are both lists, the
    second list is appended to the first. In every other case the value
    from *json_str2* replaces (or adds) the entry.

    Args:
        json_str1: JSON text providing the base mapping.
        json_str2: JSON text whose entries are merged on top of the base.

    Returns:
        The merged mapping (a shallow copy of the decoded first document,
        updated with the second).
    """
    base = json.loads(json_str1)
    overlay = json.loads(json_str2)

    combined = base.copy()
    for name, incoming in overlay.items():
        both_lists = (
            name in combined
            and isinstance(combined[name], list)
            and isinstance(incoming, list)
        )
        if not both_lists:
            # Scalars, objects, mismatched types and new keys: overlay wins.
            combined[name] = incoming
            continue
        combined[name].extend(incoming)

    return combined
def _is_empty_value(value):
    """Return True when a decoded JSON value carries no usable content."""
    if value is None:
        return True
    if isinstance(value, str):
        # Whitespace-only strings count as empty.
        return not value.strip()
    if isinstance(value, (list, dict)):
        return not value
    # Numbers and booleans (including 0 and False) are real values.
    return False


def track_query_output(keys, json_data, types):
    """Report which of *keys* are missing or empty in a JSON document.

    A key is reported when it is absent from the decoded document or when
    its value is ``null``, a blank/whitespace-only string, or an empty
    list/object. Non-string values such as numbers no longer raise
    ``AttributeError``; they are treated as present.

    Args:
        keys: Sequence of key names to look for.
        json_data: JSON text decoding to a mapping.
        types: Sequence parallel to *keys*; ``types[i]`` is reported
            alongside ``keys[i]``.

    Returns:
        A ``(result, result_types)`` tuple of two lists: the unanswered
        keys and their corresponding entries from *types*, in input order.

    Raises:
        json.JSONDecodeError: If *json_data* is not valid JSON.
        IndexError: If a reported key has no matching entry in *types*.
    """
    data = json.loads(json_data)

    result = []
    result_types = []
    for i, key in enumerate(keys):
        if key not in data or _is_empty_value(data[key]):
            result.append(key)
            result_types.append(types[i])
    return result, result_types
def add_answer_page(answer, page_name, answer_page):
    """Store one page's answer under *page_name* in the *answer* dict.

    Args:
        answer: Dictionary that accumulates per-page answers; it is
            modified in place.
        page_name: Key under which the page is stored.
        answer_page: The page content. A string is decoded as JSON first;
            any other value is stored as-is.

    Returns:
        The same *answer* dictionary, now containing the page.

    Raises:
        ValueError: If *answer* is not a dictionary.
    """
    if isinstance(answer, dict):
        if isinstance(answer_page, str):
            page = json.loads(answer_page)
        else:
            page = answer_page
        answer[page_name] = page
        return answer

    raise ValueError("The answer should be a dictionary.")