# Spaces: Sleeping
# (Hosting-page status text captured with the file; not part of the module.)
import mlcroissant._src.operation_graph.operations.download as dl_mod
import requests
import os

# Hugging Face token taken from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Publish basic-auth credentials through the environment variables Croissant
# expects. A missing token is stored as an empty password.
os.environ.update(
    {
        "CROISSANT_BASIC_AUTH_USERNAME": "hf_user",
        "CROISSANT_BASIC_AUTH_PASSWORD": HF_TOKEN or "",
    }
)
print("[DEBUG] HF_TOKEN is", "set" if HF_TOKEN else "missing")
print("[DEBUG] Basic auth env set for Croissant")

import mlcroissant as mlc
import func_timeout
import json
import traceback

# Upper bound, in seconds, allowed for generating a record from one record set.
WAIT_TIME = 10 * 60
def validate_json(file_path):
    """Check that ``file_path`` contains well-formed JSON and parse it.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(passed, message, json_data)`` tuple. ``json_data`` is the parsed
        document on success and ``None`` on any failure. Malformed JSON is
        reported as "Invalid JSON format: ..."; every other failure (missing
        file, undecodable bytes, ...) as "Error reading file: ...".
    """
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, which can reject or mis-decode non-ASCII metadata.
        with open(file_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)
    except json.JSONDecodeError as e:
        return False, f"Invalid JSON format: {str(e)}", None
    except Exception as e:
        return False, f"Error reading file: {str(e)}", None
    return True, "The file is valid JSON.", json_data
def validate_croissant(json_data):
    """Check that ``json_data`` can be loaded as a Croissant dataset.

    Args:
        json_data: Parsed JSON-LD document handed to ``mlc.Dataset``.

    Returns:
        A ``(passed, message)`` tuple. On failure the message holds the
        exception text followed by the formatted traceback.
    """
    try:
        # Construction is the check itself; the dataset object is not needed.
        mlc.Dataset(jsonld=json_data)
    except Exception as exc:
        # Croissant's own validation errors get a dedicated prefix; anything
        # else is reported as unexpected.
        if isinstance(exc, mlc.ValidationError):
            prefix = "Validation failed"
        else:
            prefix = "Unexpected error during validation"
        trace_text = traceback.format_exc()
        return False, f"{prefix}: {exc!s}\n\n{trace_text}"
    return True, "The dataset passes Croissant validation."
def try_generate_record(record_collection):
    """Try to produce the first record of ``record_collection``.

    At most one item is pulled from the iterable. Errors are returned rather
    than raised, so a caller can inspect the result and re-raise it itself.

    Args:
        record_collection: Iterable of records.

    Returns:
        The string ``"success"`` when the first record was obtained (or the
        collection was empty), otherwise the caught ``Exception`` instance.
    """
    try:
        # Only the first record matters; the default avoids StopIteration on
        # an empty collection.
        next(iter(record_collection), None)
    except Exception as e:
        return e
    return "success"
def validate_records(json_data):
    """Check that each record set can yield a record within ``WAIT_TIME``.

    Args:
        json_data: Parsed JSON-LD document handed to ``mlc.Dataset``.

    Returns:
        A ``(passed, message, status)`` tuple. ``status`` is ``"pass"`` when
        every record set succeeded (or there were none), ``"warning"`` when a
        record set timed out or failed to generate, and ``"error"`` for any
        other failure, such as the dataset itself failing to load. Checking
        stops at the first record set that fails.
    """
    try:
        dataset = mlc.Dataset(jsonld=json_data)
        record_sets = dataset.metadata.record_sets
        if not record_sets:
            return True, "No record sets found to validate.", "pass"

        # Derive the limit shown to the user from WAIT_TIME so the message
        # cannot drift from the timeout actually applied.
        limit_minutes = f"{WAIT_TIME / 60:g}"

        results = []
        for record_set in record_sets:
            try:
                result = func_timeout.func_timeout(
                    WAIT_TIME,
                    lambda: try_generate_record(dataset.records(record_set=record_set.uuid)),
                )
                if isinstance(result, Exception):
                    # try_generate_record returns errors as values; re-raise
                    # here, outside the timeout call, so the handler below
                    # formats them.
                    raise result
                results.append(f"Record set '{record_set.uuid}' passed validation.")
            except func_timeout.exceptions.FunctionTimedOut:
                error_message = (
                    f"Record set '{record_set.uuid}' generation took too long "
                    f"(>{limit_minutes} minutes)."
                )
                return False, error_message, "warning"
            except Exception as e:
                error_details = traceback.format_exc()
                error_message = (
                    f"Record set '{record_set.uuid}' failed due to generation error:\n\n"
                    f"```text\n{str(e)}\n\n{error_details}```"
                )
                return False, error_message, "warning"
        return True, "\n".join(results), "pass"
    except Exception as e:
        error_details = traceback.format_exc()
        error_message = f"Unexpected error during records validation: {str(e)}\n\n{error_details}"
        return False, error_message, "error"
def generate_validation_report(filename, json_data, results):
    """Build a markdown validation report.

    Args:
        filename: Name shown in the "Starting validation" line.
        json_data: Parsed JSON-LD document, echoed in a fenced ``json`` block.
        results: Iterable of ``(test_name, passed, message)`` or
            ``(test_name, passed, message, status)`` tuples. A status of
            ``"pass"`` or ``"warning"`` selects the matching marker; any other
            status is shown as a failure. Three-item tuples derive the status
            from ``passed``.

    Returns:
        The report as a single newline-joined string.
    """
    report = [
        "# CROISSANT VALIDATION REPORT",
        "=" * 80,
        "## VALIDATION RESULTS",
        "-" * 80,
        f"Starting validation for file: {filename}",
    ]

    # One section per validation step: heading, status marker, message.
    for result in results:
        if len(result) == 4:
            test_name, passed, message, status = result
        else:
            test_name, passed, message = result
            status = "pass" if passed else "error"

        if status == "pass":
            marker = "✓"
        elif status == "warning":
            marker = "?"  # Question mark for warning
        else:
            marker = "✗"
        # strip() trims whitespace at both ends of the message.
        report.extend([f"### {test_name}", marker, message.strip()])

    # Echo the JSON-LD. Non-ASCII text is kept readable instead of being
    # escaped; the report already contains non-ASCII markers.
    report.extend(
        [
            "## JSON-LD REFERENCE",
            "=" * 80,
            "```json",
            json.dumps(json_data, indent=2, ensure_ascii=False),
            "```",
        ]
    )
    return "\n".join(report)