File size: 4,722 Bytes
8ed167c
f65aaaf
a8989f9
f65aaaf
a8989f9
f65aaaf
 
a8989f9
 
 
f65aaaf
a8989f9
 
f65aaaf
a8989f9
 
 
 
d5f5654
71ddcd2
d5f5654
 
 
 
 
 
07c18c7
d5f5654
07c18c7
d5f5654
 
07c18c7
d5f5654
 
 
 
 
 
07c18c7
d5f5654
 
07c18c7
d5f5654
 
 
07c18c7
d5f5654
6ec1943
b218e8e
6ec1943
e360100
 
 
6ec1943
 
 
d5f5654
 
 
 
 
 
 
 
a5b79af
d5f5654
 
 
 
 
b218e8e
 
 
 
6ec1943
 
 
 
07c18c7
6ec1943
d5f5654
6ec1943
 
 
d5f5654
 
6ec1943
 
 
 
 
d5f5654
a5b79af
d5f5654
 
07c18c7
a5b79af
07c18c7
 
 
 
 
 
 
 
 
 
 
6ec1943
 
 
 
 
 
 
07c18c7
6ec1943
 
 
 
 
 
07c18c7
 
 
 
 
 
 
 
 
6ec1943
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import mlcroissant._src.operation_graph.operations.download as dl_mod
import requests
import os

# Hugging Face access token read from the environment (None when unset)
HF_TOKEN = os.getenv("HF_TOKEN")

# Export the basic-auth variables Croissant expects: a fixed user name and
# the token as password (empty string when no token is available)
os.environ.update(
    {
        "CROISSANT_BASIC_AUTH_USERNAME": "hf_user",
        "CROISSANT_BASIC_AUTH_PASSWORD": HF_TOKEN if HF_TOKEN else "",
    }
)

# Report whether a token was found without printing the token itself
print("[DEBUG] HF_TOKEN is", "set" if HF_TOKEN else "missing")
print("[DEBUG] Basic auth env set for Croissant")

import mlcroissant as mlc
import func_timeout
import json
import traceback

# Timeout passed to func_timeout when generating the first record of a record set
WAIT_TIME = 10 * 60  # seconds

def validate_json(file_path):
    """Check that a file contains well-formed JSON and return the parsed data.

    The file is decoded explicitly as UTF-8 instead of the platform's locale
    default, so the outcome does not depend on the machine running the check.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(passed, message, json_data)`` tuple. ``json_data`` holds the
        parsed document on success and is ``None`` on any failure.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)
    except json.JSONDecodeError as e:
        # The file was readable but its content is not valid JSON
        return False, f"Invalid JSON format: {str(e)}", None
    except Exception as e:
        # Missing/unreadable file, undecodable bytes, or any other failure
        return False, f"Error reading file: {str(e)}", None
    return True, "The file is valid JSON.", json_data

def validate_croissant(json_data):
    """Check that parsed JSON-LD can be loaded as a Croissant dataset.

    Args:
        json_data: Parsed JSON-LD document, handed to ``mlc.Dataset``.

    Returns:
        A ``(passed, message)`` tuple. On failure the message contains the
        exception text followed by the formatted traceback.
    """
    try:
        # Building the dataset is the whole check; the object is not kept
        mlc.Dataset(jsonld=json_data)
    except mlc.ValidationError as err:
        trace = traceback.format_exc()
        return False, f"Validation failed: {str(err)}\n\n{trace}"
    except Exception as err:
        trace = traceback.format_exc()
        return False, f"Unexpected error during validation: {str(err)}\n\n{trace}"
    return True, "The dataset passes Croissant validation."
    
def try_generate_record(record_collection):
    """Try to produce the first record of an iterable of records.

    At most one record is requested from the iterable. Errors are returned
    instead of raised, so the caller decides where to re-raise them.

    Args:
        record_collection: Iterable that yields records.

    Returns:
        The string ``"success"`` when the first record was produced or the
        iterable was empty; otherwise the ``Exception`` instance raised while
        iterating.
    """
    try:
        # Pull a single item; the default keeps an empty iterable from
        # raising StopIteration
        next(iter(record_collection), None)
    except Exception as e:
        return e
    return "success"

def validate_records(json_data):
    """Validate that records can be generated within the time limit.

    For every record set of the dataset, the first record is generated under
    a ``WAIT_TIME``-second timeout. Validation stops at the first record set
    that times out or fails.

    Args:
        json_data: Parsed JSON-LD document, handed to ``mlc.Dataset``.

    Returns:
        A ``(passed, message, status)`` tuple where ``status`` is ``"pass"``,
        ``"warning"`` (a record set timed out or failed to generate) or
        ``"error"`` (any other exception, e.g. while loading the dataset).
    """
    try:
        dataset = mlc.Dataset(jsonld=json_data)
        record_sets = dataset.metadata.record_sets

        if not record_sets:
            return True, "No record sets found to validate.", "pass"

        results = []

        for record_set in record_sets:
            try:
                # The lambda is invoked by func_timeout within this iteration,
                # so it sees the current record_set
                result = func_timeout.func_timeout(
                    WAIT_TIME,
                    lambda: try_generate_record(dataset.records(record_set=record_set.uuid))
                )

                if isinstance(result, Exception):
                    raise result  # re-raise actual error outside timeout

                results.append(f"Record set '{record_set.uuid}' passed validation.")

            except func_timeout.exceptions.FunctionTimedOut:
                # Derive the reported limit from WAIT_TIME so the message and
                # the actual timeout cannot drift apart
                error_message = (
                    f"Record set '{record_set.uuid}' generation took too long "
                    f"(>{WAIT_TIME / 60:g} minutes)."
                )
                return False, error_message, "warning"

            except Exception as e:
                error_details = traceback.format_exc()
                error_message = (
                    f"Record set '{record_set.uuid}' failed due to generation error:\n\n"
                    f"```text\n{str(e)}\n\n{error_details}```"
                )
                return False, error_message, "warning"

        return True, "\n".join(results), "pass"
    except Exception as e:
        # Anything outside per-record-set generation is reported as an error
        error_details = traceback.format_exc()
        error_message = f"Unexpected error during records validation: {str(e)}\n\n{error_details}"
        return False, error_message, "error"
    
def generate_validation_report(filename, json_data, results):
    """Build the markdown validation report for one file.

    Args:
        filename: Name shown in the "Starting validation" line.
        json_data: Parsed JSON-LD, pretty-printed in the reference section.
        results: Iterable of ``(test_name, passed, message)`` or
            ``(test_name, passed, message, status)`` entries. When no status
            is given, ``passed`` selects "pass" or "error".

    Returns:
        The report as a single newline-joined string.
    """
    heavy_rule = "=" * 80

    lines = [
        "# CROISSANT VALIDATION REPORT",
        heavy_rule,
        "## VALIDATION RESULTS",
        "-" * 80,
        f"Starting validation for file: {filename}",
    ]

    # Each test contributes a heading, a status mark and its message
    for entry in results:
        if len(entry) == 4:
            test_name, passed, message, status = entry
        else:
            test_name, passed, message = entry
            status = "pass" if passed else "error"

        if status == "pass":
            mark = "✓"
        elif status == "warning":
            mark = "?"  # warnings are shown with a question mark
        else:
            mark = "✗"

        # strip() drops surrounding whitespace, including trailing newlines
        lines += [f"### {test_name}", mark, message.strip()]

    # Close with the pretty-printed JSON-LD inside a fenced json block
    lines += [
        "## JSON-LD REFERENCE",
        heavy_rule,
        "```json",
        json.dumps(json_data, indent=2),
        "```",
    ]

    return "\n".join(lines)