import streamlit as st
import requests
import pandas as pd
import os
from datasets import load_dataset
# Set page configuration
st.set_page_config(
    page_title="Huggingface Repository Explorer",
    page_icon="🤗",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Title and description
st.title("🤗 Huggingface Repository Explorer")
st.markdown("""
This dashboard showcases our models and datasets on Huggingface.
Select a dataset to view sample data.
""")
# The access token is provided via an environment variable in the Huggingface
# Space, so it is never exposed in the code and users don't need to enter it.
AUTH_TOKEN = os.environ.get("HF_TOKEN", "")
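# For local testing, the same variable can be set before launching the app;
# a minimal sketch (the token value here is a placeholder, not a real
# credential):
#
#   export HF_TOKEN=hf_xxxxxxxxxxxx
#   streamlit run app.py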
# HF API endpoints
HF_API_BASE = "https://huggingface.co/api"
# Function to fetch dataset samples using the pre-configured token
def fetch_dataset_samples(dataset_id, n=10):
    try:
        # Load the dataset in streaming mode so only the requested
        # examples are downloaded, not the whole dataset
        dataset = load_dataset(dataset_id,
                               split="train",
                               streaming=True,
                               token=AUTH_TOKEN)
        # Collect the first n examples
        samples = []
        for i, example in enumerate(dataset):
            if i >= n:
                break
            samples.append(example)
        return samples
    except Exception as e:
        st.error(f"Error loading dataset samples: {e}")
        return None
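# Streamlit reruns the whole script on every interaction, so repeated previews
# re-stream the dataset. If that becomes slow, the lookup could be memoized
# with Streamlit's built-in cache; a minimal sketch, not wired into the app
# below (`cached_samples` is a hypothetical helper name):
#
#   @st.cache_data(show_spinner=False)
#   def cached_samples(dataset_id, n=10):
#       return fetch_dataset_samples(dataset_id, n)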
# Hard-coded model list
model_data = {
    "Model Name": [
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-c_sharp",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-python",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-C",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-java",
        "TitanCAProject/CodeBERT-javascript"
    ],
    "Description": [
        "Qwen2.5 model for the C# language",
        "Qwen2.5 model for the Python language",
        "Qwen2.5 model for the C language",
        "Qwen2.5 model for the Java language",
        "CodeBERT model for the JavaScript language"
    ],
    "Size (GB)": [0.4, 0.5, 0.9, 1.3, 0.3],
    "Last Updated": [
        "2024-11-15",
        "2024-10-30",
        "2024-12-05",
        "2024-11-20",
        "2024-12-10"
    ]
}
# Convert to a DataFrame
df_models = pd.DataFrame(model_data)
# Function to fetch dataset info, including size and sample count, from two
# endpoints: the datasets-server size endpoint and the main Hub API
def fetch_dataset_info(dataset_id):
    headers = {"Authorization": f"Bearer {AUTH_TOKEN}"}
    size_url = f"https://datasets-server.huggingface.co/size?dataset={dataset_id}"
    url = f"{HF_API_BASE}/datasets/{dataset_id}"
    try:
        # 10 s timeout guards against a hung request blocking the app
        response = requests.get(size_url, headers=headers, timeout=10)
        if response.status_code != 200:
            st.warning(f"Error fetching dataset size info: {response.status_code}")
            return None
        size_info = response.json()
        # Size in bytes of the original files, converted to MB for display
        size_bytes = size_info['size']['dataset'].get('num_bytes_original_files', 0)
        size_mb = round(size_bytes / (1024 * 1024), 2) if size_bytes else None
        # Total row count
        sample_count = size_info['size']['dataset'].get('num_rows', 0)

        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            st.warning(f"Error fetching dataset info: {response.status_code}")
            return None
        dataset_info = response.json()
        result = {
            'id': dataset_id,
            'description': dataset_info.get('description', 'No description available'),
            'size_mb': size_mb,
            'sample_count': sample_count,
            'last_modified': dataset_info.get('lastModified', 'Unknown')
        }
        return result
    except Exception as e:
        st.error(f"Error processing dataset info: {e}")
        return None
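# The /size endpoint returns a payload shaped roughly like the sketch below
# (observed shape; fields can be missing for gated or still-processing
# datasets, which is why the .get() lookups above default to 0):
#
#   {"size": {"dataset": {"num_bytes_original_files": ..., "num_rows": ...},
#             "configs": [...], "splits": [...]}, ...}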
# Main tabs
tab1, tab2 = st.tabs(["Models", "Datasets"])
# Models Tab
with tab1:
    st.header("Models")

    # Display models table
    st.dataframe(df_models, use_container_width=True)

    # Selected model details
    st.subheader("Model Details")
    selected_model = st.selectbox("Select a model for details", df_models["Model Name"], key="model_select")

    if selected_model:
        model_details = df_models[df_models["Model Name"] == selected_model].iloc[0]
        st.markdown("### " + model_details["Model Name"])
        st.markdown(f"**Description**: {model_details['Description']}")
        st.markdown(f"**Size**: {model_details['Size (GB)']} GB")
        st.markdown(f"**Last Updated**: {model_details['Last Updated']}")
# Datasets Tab
with tab2:
    st.header("Datasets")

    # List of dataset IDs to display
    dataset_ids = [
        "YChang1112/test-dataset",
        "Anthropic/EconomicIndex"
    ]

    # Get actual dataset info from the API
    dataset_info_list = []
    if AUTH_TOKEN:
        with st.spinner("Loading dataset information..."):
            for dataset_id in dataset_ids:
                info = fetch_dataset_info(dataset_id)
                if info:
                    dataset_info_list.append(info)
    else:
        st.warning("Authentication token not configured. Unable to fetch dataset information.")

    # Create a DataFrame from the collected information
    if dataset_info_list:
        df_datasets = pd.DataFrame({
            "Dataset Name": [info['id'] for info in dataset_info_list],
            "Description": [info['description'] for info in dataset_info_list],
            "Size (MB)": [info['size_mb'] for info in dataset_info_list],
            "Samples": [info['sample_count'] for info in dataset_info_list],
            "Last Modified": [info['last_modified'] for info in dataset_info_list]
        })

        # Display datasets table
        st.dataframe(df_datasets, use_container_width=True)
    else:
        st.error("No dataset information available. Please check your dataset IDs and authentication token.")
    # Dataset details with sample preview
    st.subheader("Dataset Preview")

    if dataset_info_list:
        selected_dataset = st.selectbox("Select a dataset to preview",
                                        [info['id'] for info in dataset_info_list],
                                        key="dataset_select")

        if selected_dataset:
            # Find the matching dataset info
            dataset_info = next((info for info in dataset_info_list if info['id'] == selected_dataset), None)

            if dataset_info:
                st.markdown(f"### {dataset_info['id']}")
                st.markdown(f"**Description**: {dataset_info['description']}")
                st.markdown(f"**Size**: {dataset_info['size_mb']} MB")
                st.markdown(f"**Total Samples**: {dataset_info['sample_count']:,}")
                st.markdown(f"**Last Modified**: {dataset_info['last_modified']}")

                # Show dataset samples
                st.markdown("### Sample Train Data")
                with st.spinner("Fetching dataset samples..."):
                    samples = fetch_dataset_samples(selected_dataset)

                if samples:
                    # Convert samples to a DataFrame if possible
                    try:
                        # A list of samples: normalize to handle nested structures
                        if isinstance(samples, list) and len(samples) > 0:
                            df_sample = pd.json_normalize(samples)
                            st.dataframe(df_sample, use_container_width=True)
                        # A single sample object
                        elif isinstance(samples, dict):
                            df_sample = pd.DataFrame([samples])
                            st.dataframe(df_sample, use_container_width=True)
                        else:
                            st.json(samples)
                    except Exception as e:
                        st.error(f"Error displaying samples: {e}")
                        st.json(samples)  # Fall back to raw JSON display
                else:
                    st.warning("Could not fetch dataset samples.")
# Footer
st.markdown("---")
st.markdown("Repository Explorer | Last updated: April 2025")