Spaces:

katanaml
/

sparrow-ml

Running

File size: 2,066 Bytes

42cd5f6

from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
import json


def execute_sparrow_processor(options, file_path, strategy, model_name, local, debug):
    """Extract content from ``file_path`` with the processor named in ``options``.

    ``"unstructured"`` is checked first and wins when both names are present;
    ``"markdown"`` is only considered otherwise. ``strategy`` and
    ``model_name`` are forwarded to the unstructured processor only.

    Returns:
        A ``(content, table_content)`` pair taken from the chosen processor's
        ``extract_data`` call, or ``(None, None)`` when ``options`` contains
        neither processor name.
    """
    if "unstructured" in options:
        extractor = UnstructuredProcessor()
        text, tables = extractor.extract_data(
            file_path, strategy, model_name, ['tables', 'unstructured'], local, debug
        )
        return text, tables

    if "markdown" in options:
        extractor = MarkdownProcessor()
        text, tables = extractor.extract_data(
            file_path, ['tables', 'markdown'], local, debug
        )
        return text, tables

    # No known processor requested: nothing was extracted.
    return None, None


def merge_dicts(json_str1, json_str2):
    """Merge two JSON documents into a single dictionary.

    Both arguments are decoded with ``json.loads``. Entries from the second
    document are applied on top of the first: when a key exists in both and
    both values are lists, the second list is appended to the first;
    in every other case the value from the second document replaces (or adds)
    the entry.

    Returns:
        The merged dictionary (not a JSON string).
    """
    base, overlay = json.loads(json_str1), json.loads(json_str2)

    combined = base.copy()
    for name, incoming in overlay.items():
        both_lists = (
            name in combined
            and isinstance(combined[name], list)
            and isinstance(incoming, list)
        )
        if both_lists:
            # Concatenate list values instead of overwriting them.
            combined[name].extend(incoming)
            continue
        combined[name] = incoming
    return combined


def _is_blank_answer(value):
    """Return True when a decoded JSON value should count as "no answer"."""
    if value is None:
        return True
    if isinstance(value, str):
        return not value.strip()
    if isinstance(value, (list, dict)):
        return not value
    # Numbers and booleans (including 0 and False) are real answers.
    return False


def track_query_output(keys, json_data, types):
    """Report which requested keys are still unanswered in ``json_data``.

    A key is considered unanswered when it is absent from the decoded JSON
    object or when its value is blank: ``null``, an empty or whitespace-only
    string, or an empty list/object. Non-string values no longer raise
    ``AttributeError`` as they did when ``.strip()`` was called on them
    unconditionally.

    Args:
        keys: Sequence of key names to look for.
        json_data: JSON string that decodes to an object.
        types: Sequence parallel to ``keys``; ``types[i]`` belongs to
            ``keys[i]``.

    Returns:
        A ``(missing_keys, missing_types)`` pair of lists, in the order the
        keys were given.

    Raises:
        IndexError: If ``types`` is shorter than ``keys`` and a key beyond
            its length is unanswered.
    """
    data = json.loads(json_data)

    result = []
    result_types = []

    for i, key in enumerate(keys):
        if key not in data or _is_blank_answer(data[key]):
            result.append(key)
            result_types.append(types[i])

    return result, result_types


def add_answer_page(answer, page_name, answer_page):
    """Store one page's answer under ``page_name`` in ``answer``.

    ``answer_page`` may be given either as an already-decoded value or as a
    JSON string; strings are decoded with ``json.loads`` before being stored.
    ``answer`` is modified in place and also returned.

    Raises:
        ValueError: If ``answer`` is not a dictionary.
    """
    if isinstance(answer, dict):
        # Decode JSON text; leave any other value as it is.
        page_value = json.loads(answer_page) if isinstance(answer_page, str) else answer_page
        answer[page_name] = page_value
        return answer

    raise ValueError("The answer should be a dictionary.")