Spaces:
Sleeping
Sleeping
File size: 4,722 Bytes
8ed167c f65aaaf a8989f9 f65aaaf a8989f9 f65aaaf a8989f9 f65aaaf a8989f9 f65aaaf a8989f9 d5f5654 71ddcd2 d5f5654 07c18c7 d5f5654 07c18c7 d5f5654 07c18c7 d5f5654 07c18c7 d5f5654 07c18c7 d5f5654 07c18c7 d5f5654 6ec1943 b218e8e 6ec1943 e360100 6ec1943 d5f5654 a5b79af d5f5654 b218e8e 6ec1943 07c18c7 6ec1943 d5f5654 6ec1943 d5f5654 6ec1943 d5f5654 a5b79af d5f5654 07c18c7 a5b79af 07c18c7 6ec1943 07c18c7 6ec1943 07c18c7 6ec1943 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import mlcroissant._src.operation_graph.operations.download as dl_mod
import requests
import os
# Read the Hugging Face access token from the environment; this is None when
# HF_TOKEN is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Export the token as basic-auth credentials through the CROISSANT_BASIC_AUTH_*
# environment variables. A missing token is stored as an empty password, because
# os.environ values must be strings.
# NOTE(review): presumably mlcroissant reads these variables when it downloads
# protected files, and "hf_user" is just a placeholder username — confirm
# against the library.
os.environ["CROISSANT_BASIC_AUTH_USERNAME"] = "hf_user"
os.environ["CROISSANT_BASIC_AUTH_PASSWORD"] = HF_TOKEN or ""
# Log only whether the token is present; its value is never printed.
print("[DEBUG] HF_TOKEN is", "set" if HF_TOKEN else "missing")
print("[DEBUG] Basic auth env set for Croissant")
import mlcroissant as mlc
import func_timeout
import json
import traceback
# Timeout passed to func_timeout in validate_records: the longest a single
# record set may take to yield its first record.
WAIT_TIME = 10 * 60 # seconds
def validate_json(file_path):
    """Check that ``file_path`` holds well-formed JSON and load it.

    The file is decoded explicitly as UTF-8 instead of the platform's locale
    encoding, so the outcome does not depend on the host configuration.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(ok, message, data)`` tuple. On success ``ok`` is ``True`` and
        ``data`` is the parsed document. On failure ``ok`` is ``False``,
        ``message`` describes the problem and ``data`` is ``None``.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)
    except json.JSONDecodeError as e:
        return False, f"Invalid JSON format: {e!s}", None
    except Exception as e:
        # Deliberately broad: any other failure (missing file, permissions,
        # undecodable bytes) is reported to the caller instead of raised.
        return False, f"Error reading file: {e!s}", None
    return True, "The file is valid JSON.", json_data
def validate_croissant(json_data):
    """Check whether the parsed JSON-LD can be loaded as a Croissant dataset.

    Args:
        json_data: The already-parsed JSON-LD document.

    Returns:
        A ``(passed, message)`` pair. On failure the message contains the
        exception text followed by the formatted traceback.
    """
    try:
        # Constructing the dataset is the check; the object itself is unused.
        mlc.Dataset(jsonld=json_data)
    except mlc.ValidationError as err:
        trace = traceback.format_exc()
        return False, f"Validation failed: {err!s}\n\n{trace}"
    except Exception as err:
        trace = traceback.format_exc()
        return False, f"Unexpected error during validation: {err!s}\n\n{trace}"
    return True, "The dataset passes Croissant validation."
def try_generate_record(record_collection):
    """Try to produce the first record of ``record_collection``.

    Only the first item is pulled; the rest of the iterable is left untouched.
    An empty iterable counts as success.

    Args:
        record_collection: Any iterable of records.

    Returns:
        The string ``"success"`` if the first item could be fetched (or there
        was none). Otherwise the caught exception instance is returned, not
        raised, so the caller can re-raise it itself.
    """
    try:
        # Pull a single item; the default keeps an empty iterable from raising.
        next(iter(record_collection), None)
    except Exception as e:
        return e
    return "success"
def validate_records(json_data):
    """Check that each record set can yield a first record within ``WAIT_TIME``.

    Record sets are tried in order and the function returns at the first one
    that times out or fails; later record sets are not attempted.

    Args:
        json_data: The already-parsed JSON-LD document.

    Returns:
        A ``(passed, message, status)`` tuple. ``status`` is ``"pass"`` when
        every record set succeeded (or there are none), ``"warning"`` when a
        record set timed out or raised while generating, and ``"error"`` when
        the dataset itself could not be loaded or inspected.
    """
    try:
        dataset = mlc.Dataset(jsonld=json_data)
        record_sets = dataset.metadata.record_sets
        if not record_sets:
            return True, "No record sets found to validate.", "pass"

        results = []
        for record_set in record_sets:
            try:
                result = func_timeout.func_timeout(
                    WAIT_TIME,
                    lambda: try_generate_record(dataset.records(record_set=record_set.uuid)),
                )
                if isinstance(result, Exception):
                    raise result  # re-raise actual error outside timeout
                results.append(f"Record set '{record_set.uuid}' passed validation.")
            except func_timeout.exceptions.FunctionTimedOut:
                # Derive the limit from WAIT_TIME so the message cannot drift
                # from the configured timeout.
                error_message = (
                    f"Record set '{record_set.uuid}' generation took too long "
                    f"(>{WAIT_TIME / 60:g} minutes)."
                )
                return False, error_message, "warning"
            except Exception as e:
                error_details = traceback.format_exc()
                error_message = (
                    f"Record set '{record_set.uuid}' failed due to generation error:\n\n"
                    f"```text\n{str(e)}\n\n{error_details}```"
                )
                return False, error_message, "warning"

        return True, "\n".join(results), "pass"
    except Exception as e:
        error_details = traceback.format_exc()
        error_message = f"Unexpected error during records validation: {str(e)}\n\n{error_details}"
        return False, error_message, "error"
def generate_validation_report(filename, json_data, results):
    """Build the markdown validation report as a single string.

    Args:
        filename: Name shown in the "Starting validation" line.
        json_data: Parsed document, dumped as indented JSON at the end.
        results: Iterable of ``(test_name, passed, message)`` or
            ``(test_name, passed, message, status)`` tuples. When no status is
            given it is ``"pass"`` if ``passed`` is truthy, else ``"error"``.

    Returns:
        The report lines joined with newlines.
    """
    lines = [
        "# CROISSANT VALIDATION REPORT",
        "=" * 80,
        "## VALIDATION RESULTS",
        "-" * 80,
        f"Starting validation for file: {filename}",
    ]

    # One heading, one status symbol and the message for each test.
    for entry in results:
        if len(entry) == 4:
            test_name, passed, message, status = entry
        else:
            test_name, passed, message = entry
            status = "pass" if passed else "error"

        # "?" marks a warning; anything that is neither pass nor warning is a failure.
        symbol = "✓" if status == "pass" else "?" if status == "warning" else "✗"
        lines.extend((f"### {test_name}", symbol, message.strip()))

    # Append the document itself for reference.
    lines += [
        "## JSON-LD REFERENCE",
        "=" * 80,
        "```json",
        json.dumps(json_data, indent=2),
        "```",
    ]
    return "\n".join(lines)