import streamlit as st
import requests
import pandas as pd
import os
from datasets import load_dataset
# Set page configuration
st.set_page_config(
    page_title="Huggingface Repository Explorer",
    page_icon="🤗",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Title and description
st.title("🤗 Huggingface Repository Explorer")
st.markdown("""
This dashboard showcases our models and datasets on Huggingface.
Select a dataset to view sample data.
""")
# The access token is provided via an environment variable in the Huggingface
# Space, so it is never exposed in the code and users don't need to enter it.
AUTH_TOKEN = os.environ.get("HF_TOKEN", "")
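# For local testing, the same variable can be set before launching the app;
# a minimal sketch (the token value here is a placeholder, not a real
# credential):
#
#   export HF_TOKEN=hf_xxxxxxxxxxxx
#   streamlit run app.py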
# HF API endpoints
HF_API_BASE = "https://huggingface.co/api"
# Function to fetch dataset samples using the pre-configured token
def fetch_dataset_samples(dataset_id, n=10):
    try:
        # Load the dataset in streaming mode so only the requested
        # examples are downloaded, not the whole dataset
        dataset = load_dataset(dataset_id,
                               split="train",
                               streaming=True,
                               token=AUTH_TOKEN)
        # Collect the first n examples
        samples = []
        for i, example in enumerate(dataset):
            if i >= n:
                break
            samples.append(example)
        return samples
    except Exception as e:
        st.error(f"Error loading dataset samples: {e}")
        return None
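# Streamlit reruns the whole script on every interaction, so repeated previews
# re-stream the dataset. If that becomes slow, the lookup could be memoized
# with Streamlit's built-in cache; a minimal sketch, not wired into the app
# below (`cached_samples` is a hypothetical helper name):
#
#   @st.cache_data(show_spinner=False)
#   def cached_samples(dataset_id, n=10):
#       return fetch_dataset_samples(dataset_id, n)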
# Hard-coded model list
model_data = {
    "Model Name": [
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-c_sharp",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-python",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-C",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-java",
        "TitanCAProject/CodeBERT-javascript"
    ],
    "Description": [
        "Qwen2.5 model for the C# language",
        "Qwen2.5 model for the Python language",
        "Qwen2.5 model for the C language",
        "Qwen2.5 model for the Java language",
        "CodeBERT model for the JavaScript language"
    ],
    "Size (GB)": [0.4, 0.5, 0.9, 1.3, 0.3],
    "Last Updated": [
        "2024-11-15",
        "2024-10-30",
        "2024-12-05",
        "2024-11-20",
        "2024-12-10"
    ]
}
# Convert to a DataFrame
df_models = pd.DataFrame(model_data)
# Function to fetch dataset info, including size and sample count, from two
# endpoints: the datasets-server size endpoint and the main Hub API
def fetch_dataset_info(dataset_id):
    headers = {"Authorization": f"Bearer {AUTH_TOKEN}"}
    size_url = f"https://datasets-server.huggingface.co/size?dataset={dataset_id}"
    url = f"{HF_API_BASE}/datasets/{dataset_id}"
    try:
        # 10 s timeout guards against a hung request blocking the app
        response = requests.get(size_url, headers=headers, timeout=10)
        if response.status_code != 200:
            st.warning(f"Error fetching dataset size info: {response.status_code}")
            return None
        size_info = response.json()
        # Size in bytes of the original files, converted to MB for display
        size_bytes = size_info['size']['dataset'].get('num_bytes_original_files', 0)
        size_mb = round(size_bytes / (1024 * 1024), 2) if size_bytes else None
        # Total row count
        sample_count = size_info['size']['dataset'].get('num_rows', 0)

        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            st.warning(f"Error fetching dataset info: {response.status_code}")
            return None
        dataset_info = response.json()
        result = {
            'id': dataset_id,
            'description': dataset_info.get('description', 'No description available'),
            'size_mb': size_mb,
            'sample_count': sample_count,
            'last_modified': dataset_info.get('lastModified', 'Unknown')
        }
        return result
    except Exception as e:
        st.error(f"Error processing dataset info: {e}")
        return None
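# The /size endpoint returns a payload shaped roughly like the sketch below
# (observed shape; fields can be missing for gated or still-processing
# datasets, which is why the .get() lookups above default to 0):
#
#   {"size": {"dataset": {"num_bytes_original_files": ..., "num_rows": ...},
#             "configs": [...], "splits": [...]}, ...}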
# Main tabs
tab1, tab2 = st.tabs(["Models", "Datasets"])
# Models Tab
with tab1:
    st.header("Models")

    # Display models table
    st.dataframe(df_models, use_container_width=True)

    # Selected model details
    st.subheader("Model Details")
    selected_model = st.selectbox("Select a model for details", df_models["Model Name"], key="model_select")

    if selected_model:
        model_details = df_models[df_models["Model Name"] == selected_model].iloc[0]
        st.markdown("### " + model_details["Model Name"])
        st.markdown(f"**Description**: {model_details['Description']}")
        st.markdown(f"**Size**: {model_details['Size (GB)']} GB")
        st.markdown(f"**Last Updated**: {model_details['Last Updated']}")
# Datasets Tab
with tab2:
    st.header("Datasets")

    # List of dataset IDs to display
    dataset_ids = [
        "YChang1112/test-dataset",
        "Anthropic/EconomicIndex"
    ]

    # Get actual dataset info from the API
    dataset_info_list = []
    if AUTH_TOKEN:
        with st.spinner("Loading dataset information..."):
            for dataset_id in dataset_ids:
                info = fetch_dataset_info(dataset_id)
                if info:
                    dataset_info_list.append(info)
    else:
        st.warning("Authentication token not configured. Unable to fetch dataset information.")

    # Create a DataFrame from the collected information
    if dataset_info_list:
        df_datasets = pd.DataFrame({
            "Dataset Name": [info['id'] for info in dataset_info_list],
            "Description": [info['description'] for info in dataset_info_list],
            "Size (MB)": [info['size_mb'] for info in dataset_info_list],
            "Samples": [info['sample_count'] for info in dataset_info_list],
            "Last Modified": [info['last_modified'] for info in dataset_info_list]
        })

        # Display datasets table
        st.dataframe(df_datasets, use_container_width=True)
    else:
        st.error("No dataset information available. Please check your dataset IDs and authentication token.")
    # Dataset details with sample preview
    st.subheader("Dataset Preview")

    if dataset_info_list:
        selected_dataset = st.selectbox("Select a dataset to preview",
                                        [info['id'] for info in dataset_info_list],
                                        key="dataset_select")

        if selected_dataset:
            # Find the matching dataset info
            dataset_info = next((info for info in dataset_info_list if info['id'] == selected_dataset), None)

            if dataset_info:
                st.markdown(f"### {dataset_info['id']}")
                st.markdown(f"**Description**: {dataset_info['description']}")
                st.markdown(f"**Size**: {dataset_info['size_mb']} MB")
                st.markdown(f"**Total Samples**: {dataset_info['sample_count']:,}")
                st.markdown(f"**Last Modified**: {dataset_info['last_modified']}")

                # Show dataset samples
                st.markdown("### Sample Train Data")
                with st.spinner("Fetching dataset samples..."):
                    samples = fetch_dataset_samples(selected_dataset)

                if samples:
                    # Convert samples to a DataFrame if possible
                    try:
                        # A list of samples: normalize to handle nested structures
                        if isinstance(samples, list) and len(samples) > 0:
                            df_sample = pd.json_normalize(samples)
                            st.dataframe(df_sample, use_container_width=True)
                        # A single sample object
                        elif isinstance(samples, dict):
                            df_sample = pd.DataFrame([samples])
                            st.dataframe(df_sample, use_container_width=True)
                        else:
                            st.json(samples)
                    except Exception as e:
                        st.error(f"Error displaying samples: {e}")
                        st.json(samples)  # Fall back to raw JSON display
                else:
                    st.warning("Could not fetch dataset samples.")
# Footer
st.markdown("---")
st.markdown("Repository Explorer | Last updated: April 2025")