iaroy committed
Commit fdc5d7a · 1 Parent(s): f401f1d

Deploy full application code

.gitignore ADDED
@@ -0,0 +1,39 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Environment
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+
+ # Logs
+ *.log
+ logs/
+ celery_worker_*.log
+ nohup.out
+
+ # Database
+ *.sqlite
+ *.db
+ celerybeat-schedule
Dockerfile CHANGED
@@ -1,14 +1,17 @@
  # Use the official Python 3.10.9 image
  FROM python:3.10.9

- # Copy the current directory contents into the container at .
- COPY . .
-
- # Set the working directory to /
- WORKDIR /
-
- # Install requirements.txt
+ # Set the working directory
+ WORKDIR /app
+
+ # Copy the current directory contents into the container
+ COPY . .
+
+ # Install requirements.txt
  RUN pip install --no-cache-dir --upgrade -r requirements.txt

+ # Install the application in development mode
+ RUN pip install -e .
+
  # Start the FastAPI app on port 7860, the default port expected by Spaces
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,17 +1,7 @@
- from fastapi import FastAPI
- import uvicorn
+ from app.main import app

- # Create a FastAPI app for Hugging Face Spaces
- app = FastAPI(title="Collinear API")
-
- @app.get("/")
- async def root():
-     return {"message": "Welcome to Collinear API"}
-
- @app.get("/health")
- async def health():
-     return {"status": "healthy"}
-
+ # This file is needed for Hugging Face Spaces to find the app
+
  if __name__ == "__main__":
-     # This is used when running locally
+     import uvicorn
      uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
app/api/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from fastapi import APIRouter
+ from app.api.datasets import router as datasets_router
+ # from . import batch # Removed batch import
+
+ api_router = APIRouter()
+ api_router.include_router(datasets_router, tags=["datasets"])
+ # api_router.include_router(batch.router) # Removed batch router
app/api/datasets.py ADDED
@@ -0,0 +1,151 @@
+ from fastapi import APIRouter, Query, HTTPException
+ from typing import List, Optional, Dict, Any, Set
+ from pydantic import BaseModel
+ from fastapi.concurrency import run_in_threadpool
+ from app.services.hf_datasets import (
+     get_dataset_commits,
+     get_dataset_files,
+     get_file_url,
+     get_datasets_page_from_zset,
+     get_dataset_commits_async,
+     get_dataset_files_async,
+     get_file_url_async,
+     get_datasets_page_from_cache,
+     fetch_and_cache_all_datasets,
+ )
+ from app.services.redis_client import cache_get
+ import logging
+ import time
+ from fastapi.responses import JSONResponse
+ import os
+
+ router = APIRouter(prefix="/datasets", tags=["datasets"])
+ log = logging.getLogger(__name__)
+
+ SIZE_LOW = 100 * 1024 * 1024
+ SIZE_MEDIUM = 1024 * 1024 * 1024
+
+ class DatasetInfo(BaseModel):
+     id: str
+     name: Optional[str]
+     description: Optional[str]
+     size_bytes: Optional[int]
+     impact_level: Optional[str]
+     downloads: Optional[int]
+     likes: Optional[int]
+     tags: Optional[List[str]]
+     class Config:
+         extra = "ignore"
+
+ class PaginatedDatasets(BaseModel):
+     total: int
+     items: List[DatasetInfo]
+
+ class CommitInfo(BaseModel):
+     id: str
+     title: Optional[str]
+     message: Optional[str]
+     author: Optional[Dict[str, Any]]
+     date: Optional[str]
+
+ class CacheStatus(BaseModel):
+     last_update: Optional[str]
+     total_items: int
+     warming_up: bool
+
+ def deduplicate_by_id(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+     seen: Set[str] = set()
+     unique_items = []
+     for item in items:
+         item_id = item.get("id")
+         if item_id and item_id not in seen:
+             seen.add(item_id)
+             unique_items.append(item)
+     return unique_items
+
+ @router.get("/cache-status", response_model=CacheStatus)
+ async def cache_status():
+     meta = await cache_get("hf:datasets:meta")
+     last_update = meta["last_update"] if meta and "last_update" in meta else None
+     total_items = meta["total_items"] if meta and "total_items" in meta else 0
+     warming_up = not bool(total_items)
+     return CacheStatus(last_update=last_update, total_items=total_items, warming_up=warming_up)
+
+ @router.get("/", response_model=None)
+ async def list_datasets(
+     limit: int = Query(10, ge=1, le=1000),
+     offset: int = Query(0, ge=0),
+     search: str = Query(None, description="Search term for dataset id or description"),
+     sort_by: str = Query(None, description="Field to sort by (e.g., 'downloads', 'likes', 'created_at')"),
+     sort_order: str = Query("desc", regex="^(asc|desc)$", description="Sort order: 'asc' or 'desc'"),
+ ):
+     # Fetch the full list from cache
+     result, status = get_datasets_page_from_cache(1000000, 0) # get all for in-memory filtering
+     if status != 200:
+         return JSONResponse(result, status_code=status)
+     items = result["items"]
+     # Filtering
+     if search:
+         items = [d for d in items if search.lower() in (d.get("id", "") + " " + str(d.get("description", ""))).lower()]
+     # Sorting
+     if sort_by:
+         items = sorted(items, key=lambda d: d.get(sort_by) or 0, reverse=(sort_order == "desc"))
+     # Pagination
+     total = len(items)
+     page = items[offset:offset+limit]
+     total_pages = (total + limit - 1) // limit
+     current_page = (offset // limit) + 1
+     next_page = current_page + 1 if offset + limit < total else None
+     prev_page = current_page - 1 if current_page > 1 else None
+     return {
+         "total": total,
+         "current_page": current_page,
+         "total_pages": total_pages,
+         "next_page": next_page,
+         "prev_page": prev_page,
+         "items": page
+     }
+
+ @router.get("/{dataset_id:path}/commits", response_model=List[CommitInfo])
+ async def get_commits(dataset_id: str):
+     """
+     Get commit history for a dataset.
+     """
+     try:
+         return await get_dataset_commits_async(dataset_id)
+     except Exception as e:
+         log.error(f"Error fetching commits for {dataset_id}: {e}")
+         raise HTTPException(status_code=404, detail=f"Could not fetch commits: {e}")
+
+ @router.get("/{dataset_id:path}/files", response_model=List[str])
+ async def list_files(dataset_id: str):
+     """
+     List files in a dataset.
+     """
+     try:
+         return await get_dataset_files_async(dataset_id)
+     except Exception as e:
+         log.error(f"Error listing files for {dataset_id}: {e}")
+         raise HTTPException(status_code=404, detail=f"Could not list files: {e}")
+
+ @router.get("/{dataset_id:path}/file-url")
+ async def get_file_url_endpoint(dataset_id: str, filename: str = Query(...), revision: Optional[str] = None):
+     """
+     Get download URL for a file in a dataset.
+     """
+     url = await get_file_url_async(dataset_id, filename, revision)
+     return {"download_url": url}
+
+ @router.get("/meta")
+ async def get_datasets_meta():
+     meta = await cache_get("hf:datasets:meta")
+     return meta if meta else {}
+
+ # Endpoint to trigger cache refresh manually (for admin/testing)
+ @router.post("/datasets/refresh-cache")
+ def refresh_cache():
+     token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
+     if not token:
+         return JSONResponse({"error": "HUGGINGFACEHUB_API_TOKEN not set"}, status_code=500)
+     count = fetch_and_cache_all_datasets(token)
+     return {"status": "ok", "cached": count}
app/core/celery_app.py ADDED
@@ -0,0 +1,98 @@
+ """Celery configuration for task processing."""
+
+ import logging
+ from celery import Celery
+ from celery.signals import task_failure, task_success, worker_ready, worker_shutdown
+
+ from app.core.config import settings
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+
+ # Celery configuration
+ celery_app = Celery(
+     "dataset_impacts",
+     broker=settings.REDIS_URL,
+     backend=settings.REDIS_URL,
+ )
+
+ # Configure Celery settings
+ celery_app.conf.update(
+     task_serializer="json",
+     accept_content=["json"],
+     result_serializer="json",
+     timezone="UTC",
+     enable_utc=True,
+     worker_concurrency=settings.WORKER_CONCURRENCY,
+     task_acks_late=True, # Tasks are acknowledged after execution
+     task_reject_on_worker_lost=True, # Tasks are rejected if worker is terminated during execution
+     task_time_limit=3600, # 1 hour timeout per task
+     task_soft_time_limit=3000, # Soft timeout (50 minutes) - allows for graceful shutdown
+     worker_prefetch_multiplier=1, # Single prefetch - improves fair distribution of tasks
+     broker_connection_retry=True,
+     broker_connection_retry_on_startup=True,
+     broker_connection_max_retries=10,
+     broker_pool_limit=10, # Connection pool size
+     result_expires=60 * 60 * 24, # Results expire after 24 hours
+     task_track_started=True, # Track when tasks are started
+ )
+
+ # Set up task routes for different task types
+ celery_app.conf.task_routes = {
+     "app.tasks.dataset_tasks.*": {"queue": "dataset_impacts"},
+     "app.tasks.maintenance.*": {"queue": "maintenance"},
+ }
+
+ # Configure retry settings
+ celery_app.conf.task_default_retry_delay = 30 # 30 seconds
+ celery_app.conf.task_max_retries = 3
+
+ # Setup beat schedule for periodic tasks if enabled
+ celery_app.conf.beat_schedule = {
+     "cleanup-stale-tasks": {
+         "task": "app.tasks.maintenance.cleanup_stale_tasks",
+         "schedule": 3600.0, # Run every hour
+     },
+     "health-check": {
+         "task": "app.tasks.maintenance.health_check",
+         "schedule": 300.0, # Run every 5 minutes
+     },
+     "refresh-hf-datasets-cache": {
+         "task": "app.tasks.dataset_tasks.refresh_hf_datasets_cache",
+         "schedule": 3600.0, # Run every hour
+     },
+ }
+
+ # Signal handlers for monitoring and logging
+ @task_failure.connect
+ def task_failure_handler(sender=None, task_id=None, exception=None, **kwargs):
+     """Log failed tasks."""
+     logger.error(f"Task {task_id} {sender.name} failed: {exception}")
+
+ @task_success.connect
+ def task_success_handler(sender=None, result=None, **kwargs):
+     """Log successful tasks."""
+     task_name = sender.name if sender else "Unknown"
+     logger.info(f"Task {task_name} completed successfully")
+
+ @worker_ready.connect
+ def worker_ready_handler(**kwargs):
+     """Log when worker is ready."""
+     logger.info(f"Celery worker ready: {kwargs.get('hostname')}")
+
+ @worker_shutdown.connect
+ def worker_shutdown_handler(**kwargs):
+     """Log when worker is shutting down."""
+     logger.info(f"Celery worker shutting down: {kwargs.get('hostname')}")
+
+ def get_celery_app():
+     """Get the Celery app instance."""
+     # Import all tasks to ensure they're registered
+     try:
+         # Using the improved app.tasks module which properly imports all tasks
+         import app.tasks
+         logger.info(f"Tasks successfully imported; registered {len(celery_app.tasks)} tasks")
+     except ImportError as e:
+         logger.error(f"Error importing tasks: {e}")
+
+     return celery_app
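A short sketch of queuing the periodic refresh task by name from this Celery app (not part of this commit; assumes the application environment is configured, the Redis broker at settings.REDIS_URL is reachable, and a worker is consuming the dataset_impacts queue):

from app.core.celery_app import get_celery_app

celery_app = get_celery_app()

# The task_routes mapping above sends app.tasks.dataset_tasks.* to the dataset_impacts queue
result = celery_app.send_task("app.tasks.dataset_tasks.refresh_hf_datasets_cache")
print(result.id, result.status)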
app/core/config.py ADDED
@@ -0,0 +1,48 @@
+ from __future__ import annotations
+
+ from typing import Final, Optional
+
+ from pydantic import SecretStr, HttpUrl, Field
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+ class Settings(BaseSettings):
+     """
+     Core application settings.
+     Reads environment variables and .env file.
+     """
+     # Supabase Settings
+     SUPABASE_URL: HttpUrl
+     SUPABASE_SERVICE_KEY: SecretStr
+     SUPABASE_ANON_KEY: SecretStr
+     SUPABASE_JWT_SECRET: Optional[SecretStr] = None # Optional for local dev
+
+     # Hugging Face API Token
+     HF_API_TOKEN: Optional[SecretStr] = None
+
+     # Redis settings
+     REDIS_URL: str = "redis://localhost:6379/0"
+     REDIS_PASSWORD: Optional[SecretStr] = None
+
+     # Toggle Redis cache layer
+     ENABLE_REDIS_CACHE: bool = True
+
+     # ──────────────────────────────── Security ────────────────────────────────
+     # JWT secret key. NEVER hard-code in source; override with env variable in production.
+     SECRET_KEY: SecretStr = Field("change-me", env="SECRET_KEY")
+     ACCESS_TOKEN_EXPIRE_MINUTES: int = Field(60 * 24 * 7, env="ACCESS_TOKEN_EXPIRE_MINUTES") # 1 week by default
+
+     # Worker settings
+     WORKER_CONCURRENCY: int = 10 # Increased from 5 for better parallel performance
+
+     # Batch processing chunk size for Celery dataset tasks
+     DATASET_BATCH_CHUNK_SIZE: int = 50
+
+     # Tell pydantic-settings to auto-load `.env` if present
+     model_config: Final = SettingsConfigDict(
+         env_file=".env",
+         case_sensitive=False,
+         extra="ignore"
+     )
+
+ # Single, shared instance of settings
+ settings = Settings()
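A small sketch of how these settings resolve at runtime (not part of this commit; the environment values below are placeholders for illustration only):

import os

# Placeholder values; real deployments supply these via the environment or .env
os.environ["SUPABASE_URL"] = "https://example.supabase.co"
os.environ["SUPABASE_SERVICE_KEY"] = "service-key"
os.environ["SUPABASE_ANON_KEY"] = "anon-key"
os.environ["REDIS_URL"] = "redis://localhost:6379/0"

from app.core.config import settings  # the shared instance reads env vars and .env on import

print(settings.REDIS_URL)
print(settings.SUPABASE_SERVICE_KEY.get_secret_value())  # SecretStr stays wrapped until unwrapped explicitly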
app/main.py ADDED
@@ -0,0 +1,46 @@
+ import logging
+ import json
+ from fastapi import FastAPI
+ from app.api import api_router
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.middleware.gzip import GZipMiddleware
+
+ class JsonFormatter(logging.Formatter):
+     def format(self, record):
+         log_record = {
+             "level": record.levelname,
+             "time": self.formatTime(record, self.datefmt),
+             "name": record.name,
+             "message": record.getMessage(),
+         }
+         if record.exc_info:
+             log_record["exc_info"] = self.formatException(record.exc_info)
+         return json.dumps(log_record)
+
+ handler = logging.StreamHandler()
+ handler.setFormatter(JsonFormatter())
+ logging.basicConfig(level=logging.INFO, handlers=[handler])
+
+ app = FastAPI(title="Collinear API")
+
+ # Enable CORS for the frontend
+ frontend_origin = "http://localhost:5173"
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=[frontend_origin],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ app.add_middleware(GZipMiddleware, minimum_size=1000)
+
+ app.include_router(api_router, prefix="/api")
+
+ @app.get("/")
+ async def root():
+     return {"message": "Welcome to the Collinear Data Tool API"}
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)
app/schemas/dataset.py ADDED
@@ -0,0 +1,81 @@
+ import logging
+ from typing import Dict, List, Optional, Any
+ from datetime import datetime
+ from pydantic import BaseModel, Field
+
+ from app.schemas.dataset_common import ImpactLevel, DatasetMetrics
+
+ # Log for this module
+ log = logging.getLogger(__name__)
+
+ # Supported strategies for dataset combination
+ SUPPORTED_STRATEGIES = ["merge", "intersect", "filter"]
+
+ class ImpactAssessment(BaseModel):
+     dataset_id: str = Field(..., description="The ID of the dataset being assessed")
+     impact_level: ImpactLevel = Field(..., description="The impact level: low, medium, or high")
+     assessment_method: str = Field(
+         "unknown",
+         description="Method used to determine impact level (e.g., size_based, downloads_and_likes_based)"
+     )
+     metrics: DatasetMetrics = Field(
+         ...,
+         description="Metrics used for impact assessment"
+     )
+     thresholds: Dict[str, Dict[str, str]] = Field(
+         {},
+         description="Thresholds used for determining impact levels (for reference)"
+     )
+
+ class DatasetInfo(BaseModel):
+     id: str
+     impact_level: Optional[ImpactLevel] = None
+     impact_assessment: Optional[Dict] = None
+     # Add other fields as needed
+     class Config:
+         extra = "allow" # Allow extra fields from the API
+
+ class DatasetBase(BaseModel):
+     name: str
+     description: Optional[str] = None
+     tags: Optional[List[str]] = None
+
+ class DatasetCreate(DatasetBase):
+     files: Optional[List[str]] = None
+
+ class DatasetUpdate(DatasetBase):
+     name: Optional[str] = None # Make fields optional for updates
+
+ class Dataset(DatasetBase):
+     id: int # or str depending on your ID format
+     owner_id: str # Assuming user IDs are strings
+     created_at: Optional[str] = None
+     updated_at: Optional[str] = None
+     class Config:
+         pass # Removed orm_mode = True since ORM is not used
+
+ class DatasetCombineRequest(BaseModel):
+     source_datasets: List[str] = Field(..., description="List of dataset IDs to combine")
+     name: str = Field(..., description="Name for the combined dataset")
+     description: Optional[str] = Field(None, description="Description for the combined dataset")
+     combination_strategy: str = Field("merge", description="Strategy to use when combining datasets (e.g., 'merge', 'intersect', 'filter')")
+     filter_criteria: Optional[Dict[str, Any]] = Field(None, description="Criteria for filtering when combining datasets")
+
+ class CombinedDataset(BaseModel):
+     id: str = Field(..., description="ID of the combined dataset")
+     name: str = Field(..., description="Name of the combined dataset")
+     description: Optional[str] = Field(None, description="Description of the combined dataset")
+     source_datasets: List[str] = Field(..., description="IDs of the source datasets")
+     created_at: datetime = Field(..., description="Creation timestamp")
+     created_by: str = Field(..., description="ID of the user who created this combined dataset")
+     impact_level: Optional[ImpactLevel] = Field(None, description="Calculated impact level of the combined dataset")
+     status: str = Field("processing", description="Status of the dataset combination process")
+     combination_strategy: str = Field(..., description="Strategy used when combining datasets")
+     metrics: Optional[DatasetMetrics] = Field(None, description="Metrics for the combined dataset")
+     storage_bucket_id: Optional[str] = Field(None, description="ID of the storage bucket containing dataset files")
+     storage_folder_path: Optional[str] = Field(None, description="Path to the dataset files within the bucket")
+     class Config:
+         extra = "allow" # Allow extra fields for flexibility
+
+ __all__ = ["ImpactLevel", "ImpactAssessment", "DatasetInfo", "DatasetMetrics",
+            "Dataset", "DatasetCreate", "DatasetUpdate", "DatasetCombineRequest", "CombinedDataset"]
app/schemas/dataset_common.py ADDED
@@ -0,0 +1,17 @@
+ from enum import Enum
+ from pydantic import BaseModel, Field
+ from typing import Optional
+
+ # Define the impact level as an enum for better type safety
+ class ImpactLevel(str, Enum):
+     NA = "not_available" # New category for when size information is unavailable
+     LOW = "low"
+     MEDIUM = "medium"
+     HIGH = "high"
+
+ # Define metrics model for impact assessment
+ class DatasetMetrics(BaseModel):
+     size_bytes: Optional[int] = Field(None, description="Size of the dataset in bytes")
+     file_count: Optional[int] = Field(None, description="Number of files in the dataset")
+     downloads: Optional[int] = Field(None, description="Number of downloads (all time)")
+     likes: Optional[int] = Field(None, description="Number of likes")
app/services/hf_datasets.py ADDED
@@ -0,0 +1,501 @@
+ import logging
+ import json
+ from typing import Any, List, Optional, Dict, Tuple
+ import requests
+ from huggingface_hub import HfApi
+ from app.core.config import settings
+ from app.schemas.dataset_common import ImpactLevel
+ from app.services.redis_client import sync_cache_set, sync_cache_get, generate_cache_key, get_redis_sync
+ import time
+ import asyncio
+ import redis
+ import gzip
+ from datetime import datetime, timezone
+ import os
+ from app.schemas.dataset import ImpactAssessment
+ from app.schemas.dataset_common import DatasetMetrics
+ import httpx
+ import redis.asyncio as aioredis
+
+ log = logging.getLogger(__name__)
+ api = HfApi()
+ redis_client = redis.Redis(host="redis", port=6379, decode_responses=True)
+
+ # Thresholds for impact categorization
+ SIZE_THRESHOLD_LOW = 100 * 1024 * 1024 # 100 MB
+ SIZE_THRESHOLD_MEDIUM = 1024 * 1024 * 1024 # 1 GB
+ DOWNLOADS_THRESHOLD_LOW = 1000
+ DOWNLOADS_THRESHOLD_MEDIUM = 10000
+ LIKES_THRESHOLD_LOW = 10
+ LIKES_THRESHOLD_MEDIUM = 100
+
+ HF_API_URL = "https://huggingface.co/api/datasets"
+ DATASET_CACHE_TTL = 60 * 60 # 1 hour
+
+ # Redis and HuggingFace API setup
+ REDIS_KEY = "hf:datasets:all:compressed"
+ REDIS_META_KEY = "hf:datasets:meta"
+ REDIS_TTL = 60 * 60 # 1 hour
+
+ # Impact thresholds (in bytes)
+ SIZE_LOW = 100 * 1024 * 1024
+ SIZE_MEDIUM = 1024 * 1024 * 1024
+
+ def get_hf_token():
+     token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
+     if not token:
+         raise RuntimeError("HUGGINGFACEHUB_API_TOKEN environment variable is not set. Please set it securely.")
+     return token
+
+ def get_dataset_commits(dataset_id: str, limit: int = 20):
+     from huggingface_hub import HfApi
+     import logging
+     log = logging.getLogger(__name__)
+     api = HfApi()
+     log.info(f"[get_dataset_commits] Fetching commits for dataset_id={dataset_id}")
+     try:
+         commits = api.list_repo_commits(repo_id=dataset_id, repo_type="dataset")
+         log.info(f"[get_dataset_commits] Received {len(commits)} commits for {dataset_id}")
+     except Exception as e:
+         log.error(f"[get_dataset_commits] Error fetching commits for {dataset_id}: {e}", exc_info=True)
+         raise # Let the API layer catch and handle this
+     result = []
+     for c in commits[:limit]:
+         try:
+             commit_id = getattr(c, "commit_id", "")
+             title = getattr(c, "title", "")
+             message = getattr(c, "message", title)
+             authors = getattr(c, "authors", [])
+             author_name = authors[0] if authors and isinstance(authors, list) else ""
+             created_at = getattr(c, "created_at", None)
+             if created_at:
+                 if hasattr(created_at, "isoformat"):
+                     date = created_at.isoformat()
+                 else:
+                     date = str(created_at)
+             else:
+                 date = ""
+             result.append({
+                 "id": commit_id or "",
+                 "title": title or message or "",
+                 "message": message or title or "",
+                 "author": {"name": author_name, "email": ""},
+                 "date": date,
+             })
+         except Exception as e:
+             log.error(f"[get_dataset_commits] Error parsing commit: {e} | Commit: {getattr(c, '__dict__', str(c))}", exc_info=True)
+     log.info(f"[get_dataset_commits] Returning {len(result)} parsed commits for {dataset_id}")
+     return result
+
+ def get_dataset_files(dataset_id: str) -> List[str]:
+     return api.list_repo_files(repo_id=dataset_id, repo_type="dataset")
+
+ def get_file_url(dataset_id: str, filename: str, revision: Optional[str] = None) -> str:
+     from huggingface_hub import hf_hub_url
+     return hf_hub_url(repo_id=dataset_id, filename=filename, repo_type="dataset", revision=revision)
+
+ def get_datasets_page_from_zset(offset: int = 0, limit: int = 10, search: str = None) -> dict:
+     import redis
+     import json
+     redis_client = redis.Redis(host="redis", port=6379, db=0, decode_responses=True)
+     zset_key = "hf:datasets:all:zset"
+     hash_key = "hf:datasets:all:hash"
+     # Get total count
+     total = redis_client.zcard(zset_key)
+     # Get dataset IDs for the page
+     ids = redis_client.zrange(zset_key, offset, offset + limit - 1)
+     # Fetch metadata for those IDs
+     if not ids:
+         return {"items": [], "count": total}
+     items = redis_client.hmget(hash_key, ids)
+     # Parse JSON and filter/search if needed
+     parsed = []
+     for raw in items:
+         if not raw:
+             continue
+         try:
+             item = json.loads(raw)
+             parsed.append(item)
+         except Exception:
+             continue
+     if search:
+         parsed = [d for d in parsed if search.lower() in (d.get("id") or "").lower()]
+     return {"items": parsed, "count": total}
+
+ async def _fetch_size(session: httpx.AsyncClient, dataset_id: str) -> Optional[int]:
+     """Fetch dataset size from the datasets server asynchronously."""
+     url = f"https://datasets-server.huggingface.co/size?dataset={dataset_id}"
+     try:
+         resp = await session.get(url, timeout=30)
+         if resp.status_code == 200:
+             data = resp.json()
+             return data.get("size", {}).get("dataset", {}).get("num_bytes_original_files")
+     except Exception as e:
+         log.warning(f"Could not fetch size for {dataset_id}: {e}")
+     return None
+
+ async def _fetch_sizes(dataset_ids: List[str]) -> Dict[str, Optional[int]]:
+     """Fetch dataset sizes in parallel."""
+     results: Dict[str, Optional[int]] = {}
+     async with httpx.AsyncClient() as session:
+         tasks = {dataset_id: asyncio.create_task(_fetch_size(session, dataset_id)) for dataset_id in dataset_ids}
+         for dataset_id, task in tasks.items():
+             results[dataset_id] = await task
+     return results
+
+ def process_datasets_page(offset, limit):
+     """
+     Fetch and process a single page of datasets from Hugging Face and cache them in Redis.
+     """
+     import redis
+     import os
+     import json
+     import asyncio
+     log = logging.getLogger(__name__)
+     log.info(f"[process_datasets_page] ENTRY: offset={offset}, limit={limit}")
+     token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
+     if not token:
+         log.error("[process_datasets_page] HUGGINGFACEHUB_API_TOKEN environment variable is not set.")
+         raise RuntimeError("HUGGINGFACEHUB_API_TOKEN environment variable is not set. Please set it securely.")
+     headers = {
+         "Authorization": f"Bearer {token}",
+         "User-Agent": "Mozilla/5.0 (compatible; CollinearTool/1.0; +https://yourdomain.com)"
+     }
+     params = {"limit": limit, "offset": offset, "full": "True"}
+     redis_client = redis.Redis(host="redis", port=6379, db=0, decode_responses=True)
+     stream_key = "hf:datasets:all:stream"
+     zset_key = "hf:datasets:all:zset"
+     hash_key = "hf:datasets:all:hash"
+     try:
+         log.info(f"[process_datasets_page] Requesting {HF_API_URL} with params={params}")
+         response = requests.get(HF_API_URL, headers=headers, params=params, timeout=120)
+         response.raise_for_status()
+
+         page_items = response.json()
+
+         log.info(f"[process_datasets_page] Received {len(page_items)} datasets at offset {offset}")
+
+         dataset_ids = [ds.get("id") for ds in page_items]
+         size_map = asyncio.run(_fetch_sizes(dataset_ids))
+
+         for ds in page_items:
+             dataset_id = ds.get("id")
+             size_bytes = size_map.get(dataset_id)
+             downloads = ds.get("downloads")
+             likes = ds.get("likes")
+             impact_level, assessment_method = determine_impact_level_by_criteria(size_bytes, downloads, likes)
+             metrics = DatasetMetrics(size_bytes=size_bytes, downloads=downloads, likes=likes)
+             thresholds = {
+                 "size_bytes": {
+                     "low": str(100 * 1024 * 1024),
+                     "medium": str(1 * 1024 * 1024 * 1024),
+                     "high": str(10 * 1024 * 1024 * 1024)
+                 }
+             }
+             impact_assessment = ImpactAssessment(
+                 dataset_id=dataset_id,
+                 impact_level=impact_level,
+                 assessment_method=assessment_method,
+                 metrics=metrics,
+                 thresholds=thresholds
+             ).model_dump()
+             item = {
+                 "id": dataset_id,
+                 "name": ds.get("name"),
+                 "description": ds.get("description"),
+                 "size_bytes": size_bytes,
+                 "impact_level": impact_level.value if isinstance(impact_level, ImpactLevel) else impact_level,
+                 "downloads": downloads,
+                 "likes": likes,
+                 "tags": ds.get("tags", []),
+                 "impact_assessment": json.dumps(impact_assessment)
+             }
+             final_item = {}
+             for k, v in item.items():
+                 if isinstance(v, list) or isinstance(v, dict):
+                     final_item[k] = json.dumps(v)
+                 elif v is None:
+                     final_item[k] = 'null'
+                 else:
+                     final_item[k] = str(v)
+
+             redis_client.xadd(stream_key, final_item)
+             redis_client.zadd(zset_key, {dataset_id: offset})
+             redis_client.hset(hash_key, dataset_id, json.dumps(item))
+
+         log.info(f"[process_datasets_page] EXIT: Cached {len(page_items)} datasets at offset {offset}")
+         return len(page_items)
+     except Exception as exc:
+         log.error(f"[process_datasets_page] ERROR: offset={offset}, limit={limit}, exc={exc}", exc_info=True)
+         raise
+
+ def refresh_datasets_cache():
+     """
+     Orchestrator: Enqueue Celery tasks to fetch all Hugging Face datasets in parallel.
+     Uses direct calls to HF API.
+     """
+     import requests
+     log.info("[refresh_datasets_cache] Orchestrating dataset fetch tasks using direct HF API calls.")
+     token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
+     if not token:
+         log.error("[refresh_datasets_cache] HUGGINGFACEHUB_API_TOKEN environment variable is not set.")
+         raise RuntimeError("HUGGINGFACEHUB_API_TOKEN environment variable is not set. Please set it securely.")
+
+     headers = {
+         "Authorization": f"Bearer {token}",
+         "User-Agent": "Mozilla/5.0 (compatible; CollinearTool/1.0; +https://yourdomain.com)"
+     }
+     limit = 500
+
+     params = {"limit": 1, "offset": 0}
+     try:
+         response = requests.get(HF_API_URL, headers=headers, params=params, timeout=120)
+         response.raise_for_status()
+         total_str = response.headers.get('X-Total-Count')
+         if not total_str:
+             log.error("[refresh_datasets_cache] 'X-Total-Count' header not found in HF API response.")
+             raise ValueError("'X-Total-Count' header missing from Hugging Face API response.")
+         total = int(total_str)
+         log.info(f"[refresh_datasets_cache] Total datasets reported by HF API: {total}")
+     except requests.RequestException as e:
+         log.error(f"[refresh_datasets_cache] Error fetching total dataset count from HF API: {e}")
+         raise
+     except ValueError as e:
+         log.error(f"[refresh_datasets_cache] Error parsing total dataset count: {e}")
+         raise
+
+     num_pages = (total + limit - 1) // limit
+     from app.tasks.dataset_tasks import fetch_datasets_page
+     from celery import group
+     tasks = []
+     for page_num in range(num_pages):
+         offset = page_num * limit
+         tasks.append(fetch_datasets_page.s(offset, limit))
+         log.info(f"[refresh_datasets_cache] Scheduled page at offset {offset}, limit {limit}.")
+     if tasks:
+         group(tasks).apply_async()
+         log.info(f"[refresh_datasets_cache] Enqueued {len(tasks)} fetch tasks.")
+     else:
+         log.warning("[refresh_datasets_cache] No dataset pages found to schedule.")
+
+ def determine_impact_level_by_criteria(size_bytes, downloads=None, likes=None):
+     try:
+         size = int(size_bytes) if size_bytes not in (None, 'null') else 0
+     except Exception:
+         size = 0
+
+     # Prefer size_bytes if available
+     if size >= 10 * 1024 * 1024 * 1024:
+         return ("high", "large_size")
+     elif size >= 1 * 1024 * 1024 * 1024:
+         return ("medium", "medium_size")
+     elif size >= 100 * 1024 * 1024:
+         return ("low", "small_size")
+     # Fallback to downloads if size_bytes is missing or too small
+     if downloads is not None:
+         try:
+             downloads = int(downloads)
+             if downloads >= 100000:
+                 return ("high", "downloads")
+             elif downloads >= 10000:
+                 return ("medium", "downloads")
+             elif downloads >= 1000:
+                 return ("low", "downloads")
+         except Exception:
+             pass
+     # Fallback to likes if downloads is missing
+     if likes is not None:
+         try:
+             likes = int(likes)
+             if likes >= 1000:
+                 return ("high", "likes")
+             elif likes >= 100:
+                 return ("medium", "likes")
+             elif likes >= 10:
+                 return ("low", "likes")
+         except Exception:
+             pass
+     return ("not_available", "size_and_downloads_and_likes_unknown")
+
+ def get_dataset_size(dataset: dict, dataset_id: str = None):
+     """
+     Extract the size in bytes from a dataset dictionary.
+     Tries multiple locations based on possible HuggingFace API responses.
+     """
+     # Try top-level key
+     size_bytes = dataset.get("size_bytes")
+     if size_bytes not in (None, 'null'):
+         return size_bytes
+     # Try nested structure from the size API
+     size_bytes = (
+         dataset.get("size", {})
+         .get("dataset", {})
+         .get("num_bytes_original_files")
+     )
+     if size_bytes not in (None, 'null'):
+         return size_bytes
+     # Try metrics or info sub-dictionaries if present
+     for key in ["metrics", "info"]:
+         sub = dataset.get(key, {})
+         if isinstance(sub, dict):
+             size_bytes = sub.get("size_bytes")
+             if size_bytes not in (None, 'null'):
+                 return size_bytes
+     # Not found
+     return None
+
+ async def get_datasets_page_from_zset_async(offset: int = 0, limit: int = 10, search: str = None) -> dict:
+     redis_client = aioredis.Redis(host="redis", port=6379, db=0, decode_responses=True)
+     zset_key = "hf:datasets:all:zset"
+     hash_key = "hf:datasets:all:hash"
+     total = await redis_client.zcard(zset_key)
+     ids = await redis_client.zrange(zset_key, offset, offset + limit - 1)
+     if not ids:
+         return {"items": [], "count": total}
+     items = await redis_client.hmget(hash_key, ids)
+     parsed = []
+     for raw in items:
+         if not raw:
+             continue
+         try:
+             item = json.loads(raw)
+             parsed.append(item)
+         except Exception:
+             continue
+     if search:
+         parsed = [d for d in parsed if search.lower() in (d.get("id") or "").lower()]
+     return {"items": parsed, "count": total}
+
+ async def get_dataset_commits_async(dataset_id: str, limit: int = 20):
+     from huggingface_hub import HfApi
+     import logging
+     log = logging.getLogger(__name__)
+     api = HfApi()
+     log.info(f"[get_dataset_commits_async] Fetching commits for dataset_id={dataset_id}")
+     try:
+         # huggingface_hub is sync, so run in threadpool
+         import anyio
+         commits = await anyio.to_thread.run_sync(lambda: api.list_repo_commits(dataset_id, repo_type="dataset"))
+         log.info(f"[get_dataset_commits_async] Received {len(commits)} commits for {dataset_id}")
+     except Exception as e:
+         log.error(f"[get_dataset_commits_async] Error fetching commits for {dataset_id}: {e}", exc_info=True)
+         raise
+     result = []
+     for c in commits[:limit]:
+         try:
+             commit_id = getattr(c, "commit_id", "")
+             title = getattr(c, "title", "")
+             message = getattr(c, "message", title)
+             authors = getattr(c, "authors", [])
+             author_name = authors[0] if authors and isinstance(authors, list) else ""
+             created_at = getattr(c, "created_at", None)
+             if created_at:
+                 if hasattr(created_at, "isoformat"):
+                     date = created_at.isoformat()
+                 else:
+                     date = str(created_at)
+             else:
+                 date = ""
+             result.append({
+                 "id": commit_id or "",
+                 "title": title or message or "",
+                 "message": message or title or "",
+                 "author": {"name": author_name, "email": ""},
+                 "date": date,
+             })
+         except Exception as e:
+             log.error(f"[get_dataset_commits_async] Error parsing commit: {e} | Commit: {getattr(c, '__dict__', str(c))}", exc_info=True)
+     log.info(f"[get_dataset_commits_async] Returning {len(result)} parsed commits for {dataset_id}")
+     return result
+
+ async def get_dataset_files_async(dataset_id: str) -> List[str]:
+     from huggingface_hub import HfApi
+     import anyio
+     api = HfApi()
+     # huggingface_hub is sync, so run in threadpool
+     return await anyio.to_thread.run_sync(lambda: api.list_repo_files(dataset_id, repo_type="dataset"))
+
+ async def get_file_url_async(dataset_id: str, filename: str, revision: Optional[str] = None) -> str:
+     from huggingface_hub import hf_hub_url
+     import anyio
+     # huggingface_hub is sync, so run in threadpool
+     return await anyio.to_thread.run_sync(lambda: hf_hub_url(repo_id=dataset_id, filename=filename, repo_type="dataset", revision=revision))
+
+ # Fetch and cache all datasets
+
+ class EnhancedJSONEncoder(json.JSONEncoder):
+     def default(self, obj):
+         if isinstance(obj, datetime):
+             return obj.isoformat()
+         return super().default(obj)
+
+ async def fetch_size(session, dataset_id, token=None):
+     url = f"https://datasets-server.huggingface.co/size?dataset={dataset_id}"
+     headers = {"Authorization": f"Bearer {token}"} if token else {}
+     try:
+         resp = await session.get(url, headers=headers, timeout=30)
+         if resp.status_code == 200:
+             data = resp.json()
+             return dataset_id, data.get("size", {}).get("dataset", {}).get("num_bytes_original_files")
+     except Exception as e:
+         log.warning(f"Could not fetch size for {dataset_id}: {e}")
+     return dataset_id, None
+
+ async def fetch_all_sizes(dataset_ids, token=None, batch_size=50):
+     results = {}
+     async with httpx.AsyncClient() as session:
+         for i in range(0, len(dataset_ids), batch_size):
+             batch = dataset_ids[i:i+batch_size]
+             tasks = [fetch_size(session, ds_id, token) for ds_id in batch]
+             batch_results = await asyncio.gather(*tasks)
+             for ds_id, size in batch_results:
+                 results[ds_id] = size
+     return results
+
+ def fetch_and_cache_all_datasets(token: str):
+     api = HfApi(token=token)
+     log.info("Fetching all datasets from Hugging Face Hub...")
+     all_datasets = list(api.list_datasets())
+     all_datasets_dicts = []
+     dataset_ids = [d.id for d in all_datasets]
+     # Fetch all sizes in batches
+     sizes = asyncio.run(fetch_all_sizes(dataset_ids, token=token, batch_size=50))
+     for d in all_datasets:
+         data = d.__dict__
+         size_bytes = sizes.get(d.id)
+         downloads = data.get("downloads")
+         likes = data.get("likes")
+         data["size_bytes"] = size_bytes
+         impact_level, _ = determine_impact_level_by_criteria(size_bytes, downloads, likes)
+         data["impact_level"] = impact_level
+         all_datasets_dicts.append(data)
+     compressed = gzip.compress(json.dumps(all_datasets_dicts, cls=EnhancedJSONEncoder).encode("utf-8"))
+     r = redis.Redis(host="redis", port=6379, decode_responses=False)
+     r.set(REDIS_KEY, compressed)
+     log.info(f"Cached {len(all_datasets_dicts)} datasets in Redis under {REDIS_KEY}")
+     return len(all_datasets_dicts)
+
+ # Native pagination from cache
+
+ def get_datasets_page_from_cache(limit: int, offset: int):
+     r = redis.Redis(host="redis", port=6379, decode_responses=False)
+     compressed = r.get(REDIS_KEY)
+     if not compressed:
+         return {"error": "Cache not found. Please refresh datasets."}, 404
+     all_datasets = json.loads(gzip.decompress(compressed).decode("utf-8"))
+     total = len(all_datasets)
+     if offset < 0 or offset >= total:
+         return {"error": "Offset out of range.", "total": total}, 400
+     page = all_datasets[offset:offset+limit]
+     total_pages = (total + limit - 1) // limit
+     current_page = (offset // limit) + 1
+     next_page = current_page + 1 if offset + limit < total else None
+     prev_page = current_page - 1 if current_page > 1 else None
+     return {
+         "total": total,
+         "current_page": current_page,
+         "total_pages": total_pages,
+         "next_page": next_page,
+         "prev_page": prev_page,
+         "items": page
+     }, 200
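A quick sketch of how the size/downloads/likes fallback in determine_impact_level_by_criteria plays out (not part of this commit; assumes the application environment is configured so the module imports cleanly, and the values are illustrative):

from app.services.hf_datasets import determine_impact_level_by_criteria

# Size takes precedence: 2 GiB falls in the 1 GiB - 10 GiB band
print(determine_impact_level_by_criteria(2 * 1024**3))             # ("medium", "medium_size")

# With no usable size, downloads decide the level
print(determine_impact_level_by_criteria(None, downloads=50_000))  # ("medium", "downloads")

# With neither size nor downloads, likes are the last resort
print(determine_impact_level_by_criteria(None, likes=15))          # ("low", "likes")

# Nothing known at all
print(determine_impact_level_by_criteria(None))                    # ("not_available", "size_and_downloads_and_likes_unknown")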
app/services/redis_client.py ADDED
@@ -0,0 +1,302 @@
+ """Redis client for caching and task queue management."""
+
+ import json
+ from typing import Any, Dict, Optional, TypeVar
+ from datetime import datetime
+ import logging
+ from time import time as _time
+
+ import redis.asyncio as redis_async
+ import redis as redis_sync # Import synchronous Redis client
+ from pydantic import BaseModel
+ from tenacity import retry, stop_after_attempt, wait_exponential
+
+ from app.core.config import settings
+
+ # Type variable for cache
+ T = TypeVar('T')
+
+ # Configure logging
+ log = logging.getLogger(__name__)
+
+ # Redis connection pools for reusing connections
+ _redis_pool_async = None
+ _redis_pool_sync = None # Synchronous pool
+
+ # Default cache expiration (12 hours)
+ DEFAULT_CACHE_EXPIRY = 60 * 60 * 12
+
+ @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=1, max=10))
+ async def get_redis_pool() -> redis_async.Redis:
+     """Get or create async Redis connection pool with retry logic."""
+     global _redis_pool_async
+
+     if _redis_pool_async is None:
+         # Get Redis configuration from settings
+         redis_url = settings.REDIS_URL or "redis://localhost:6379/0"
+
+         try:
+             # Create connection pool with reasonable defaults
+             _redis_pool_async = redis_async.ConnectionPool.from_url(
+                 redis_url,
+                 max_connections=10,
+                 decode_responses=True,
+                 health_check_interval=5,
+                 socket_connect_timeout=5,
+                 socket_keepalive=True,
+                 retry_on_timeout=True
+             )
+             log.info(f"Created async Redis connection pool with URL: {redis_url}")
+         except Exception as e:
+             log.error(f"Error creating async Redis connection pool: {e}")
+             raise
+
+     return redis_async.Redis(connection_pool=_redis_pool_async)
+
+ def get_redis_pool_sync() -> redis_sync.Redis:
+     """Get or create synchronous Redis connection pool."""
+     global _redis_pool_sync
+
+     if _redis_pool_sync is None:
+         # Get Redis configuration from settings
+         redis_url = settings.REDIS_URL or "redis://localhost:6379/0"
+
+         try:
+             # Create connection pool with reasonable defaults
+             _redis_pool_sync = redis_sync.ConnectionPool.from_url(
+                 redis_url,
+                 max_connections=10,
+                 decode_responses=True,
+                 socket_connect_timeout=5,
+                 socket_keepalive=True,
+                 retry_on_timeout=True
+             )
+             log.info(f"Created sync Redis connection pool with URL: {redis_url}")
+         except Exception as e:
+             log.error(f"Error creating sync Redis connection pool: {e}")
+             raise
+
+     return redis_sync.Redis(connection_pool=_redis_pool_sync)
+
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=5))
+ async def get_redis() -> redis_async.Redis:
+     """Get Redis client from pool with retry logic."""
+     try:
+         redis_client = await get_redis_pool()
+         return redis_client
+     except Exception as e:
+         log.error(f"Error getting Redis client: {e}")
+         raise
+
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=5))
+ def get_redis_sync() -> redis_sync.Redis:
+     """Get synchronous Redis client from pool with retry logic."""
+     try:
+         return get_redis_pool_sync()
+     except Exception as e:
+         log.error(f"Error getting synchronous Redis client: {e}")
+         raise
+
+ # Cache key generation
+ def generate_cache_key(prefix: str, *args: Any) -> str:
+     """Generate cache key with prefix and args."""
+     key_parts = [prefix] + [str(arg) for arg in args if arg]
+     return ":".join(key_parts)
+
+ # JSON serialization helpers
+ def _json_serialize(obj: Any) -> str:
+     """Serialize object to JSON with datetime support."""
+     def _serialize_datetime(o: Any) -> str:
+         if isinstance(o, datetime):
+             return o.isoformat()
+         if isinstance(o, BaseModel):
+             return o.dict()
+         return str(o)
+
+     return json.dumps(obj, default=_serialize_datetime)
+
+ def _json_deserialize(data: str, model_class: Optional[type] = None) -> Any:
+     """Deserialize JSON string to object with datetime support."""
+     result = json.loads(data)
+
+     if model_class and issubclass(model_class, BaseModel):
+         return model_class.parse_obj(result)
+
+     return result
+
+ # Async cache operations
+ async def cache_set(key: str, value: Any, expire: int = DEFAULT_CACHE_EXPIRY) -> bool:
+     """Set cache value with expiration (async version)."""
+     redis_client = await get_redis()
+     serialized = _json_serialize(value)
+
+     try:
+         await redis_client.set(key, serialized, ex=expire)
+         log.debug(f"Cached data at key: {key}, expires in {expire}s")
+         return True
+     except Exception as e:
+         log.error(f"Error caching data at key {key}: {e}")
+         return False
+
+ async def cache_get(key: str, model_class: Optional[type] = None) -> Optional[Any]:
+     """Get cache value with optional model deserialization (async version)."""
+     redis_client = await get_redis()
+
+     try:
+         data = await redis_client.get(key)
+         if not data:
+             return None
+
+         log.debug(f"Cache hit for key: {key}")
+         return _json_deserialize(data, model_class)
+     except Exception as e:
+         log.error(f"Error retrieving cache for key {key}: {e}")
+         return None
+
+ # Synchronous cache operations for Celery tasks
+ def sync_cache_set(key: str, value: Any, expire: int = DEFAULT_CACHE_EXPIRY) -> bool:
+     """Set cache value with expiration (synchronous version for Celery tasks). Logs slow operations."""
+     redis_client = get_redis_sync()
+     serialized = _json_serialize(value)
+     start = _time()
+     try:
+         redis_client.set(key, serialized, ex=expire)
+         elapsed = _time() - start
+         if elapsed > 2:
+             log.warning(f"Slow sync_cache_set for key {key}: {elapsed:.2f}s")
+         log.debug(f"Cached data at key: {key}, expires in {expire}s (sync)")
+         return True
+     except Exception as e:
+         log.error(f"Error caching data at key {key}: {e}")
+         return False
+
+ def sync_cache_get(key: str, model_class: Optional[type] = None) -> Optional[Any]:
+     """Get cache value with optional model deserialization (synchronous version for Celery tasks). Logs slow operations."""
+     redis_client = get_redis_sync()
+     start = _time()
+     try:
+         data = redis_client.get(key)
+         elapsed = _time() - start
+         if elapsed > 2:
+             log.warning(f"Slow sync_cache_get for key {key}: {elapsed:.2f}s")
+         if not data:
+             return None
+         log.debug(f"Cache hit for key: {key} (sync)")
+         return _json_deserialize(data, model_class)
+     except Exception as e:
+         log.error(f"Error retrieving cache for key {key}: {e}")
+         return None
+
+ async def cache_invalidate(key: str) -> bool:
+     """Invalidate cache for key."""
+     redis_client = await get_redis()
+
+     try:
+         await redis_client.delete(key)
+         log.debug(f"Invalidated cache for key: {key}")
+         return True
+     except Exception as e:
+         log.error(f"Error invalidating cache for key {key}: {e}")
+         return False
+
+ async def cache_invalidate_pattern(pattern: str) -> int:
+     """Invalidate all cache keys matching pattern."""
+     redis_client = await get_redis()
+
+     try:
+         keys = await redis_client.keys(pattern)
+         if not keys:
+             return 0
+
+         count = await redis_client.delete(*keys)
+         log.debug(f"Invalidated {count} keys matching pattern: {pattern}")
+         return count
+     except Exception as e:
+         log.error(f"Error invalidating keys with pattern {pattern}: {e}")
+         return 0
+
+ # Task queue operations
+ async def enqueue_task(queue_name: str, task_id: str, payload: Dict[str, Any]) -> bool:
+     """Add task to queue."""
+     redis_client = await get_redis()
+
+     try:
+         serialized = _json_serialize(payload)
+         await redis_client.lpush(f"queue:{queue_name}", serialized)
+         await redis_client.hset(f"tasks:{queue_name}", task_id, "pending")
+         log.info(f"Enqueued task {task_id} to queue {queue_name}")
+         return True
+     except Exception as e:
+         log.error(f"Error enqueueing task {task_id} to {queue_name}: {e}")
+         return False
+
+ async def mark_task_complete(queue_name: str, task_id: str, result: Optional[Dict[str, Any]] = None) -> bool:
+     """Mark task as complete with optional result."""
+     redis_client = await get_redis()
+
+     try:
+         # Store result if provided
+         if result:
+             await redis_client.hset(
+                 f"results:{queue_name}",
+                 task_id,
+                 _json_serialize(result)
+             )
+
+         # Mark task as complete
+         await redis_client.hset(f"tasks:{queue_name}", task_id, "complete")
+         await redis_client.expire(f"tasks:{queue_name}", 86400) # Expire after 24 hours
+
+         log.info(f"Marked task {task_id} as complete in queue {queue_name}")
+         return True
+     except Exception as e:
+         log.error(f"Error marking task {task_id} as complete: {e}")
+         return False
+
+ async def get_task_status(queue_name: str, task_id: str) -> Optional[str]:
+     """Get status of a task."""
+     redis_client = await get_redis()
+
+     try:
+         status = await redis_client.hget(f"tasks:{queue_name}", task_id)
+         return status
+     except Exception as e:
+         log.error(f"Error getting status for task {task_id}: {e}")
+         return None
+
+ async def get_task_result(queue_name: str, task_id: str) -> Optional[Dict[str, Any]]:
+     """Get result of a completed task."""
+     redis_client = await get_redis()
+
+     try:
+         data = await redis_client.hget(f"results:{queue_name}", task_id)
+         if not data:
+             return None
+
+         return _json_deserialize(data)
+     except Exception as e:
+         log.error(f"Error getting result for task {task_id}: {e}")
+         return None
+
+ # Stream processing for real-time updates
+ async def add_to_stream(stream: str, data: Dict[str, Any], max_len: int = 1000) -> str:
+     """Add event to Redis stream."""
+     redis_client = await get_redis()
+
+     try:
+         # Convert dict values to strings (Redis streams requirement)
+         entry = {k: _json_serialize(v) for k, v in data.items()}
+
+         # Add to stream with automatic ID generation
+         event_id = await redis_client.xadd(
+             stream,
+             entry,
+             maxlen=max_len,
+             approximate=True
+         )
+
+         log.debug(f"Added event {event_id} to stream {stream}")
+         return event_id
+     except Exception as e:
+         log.error(f"Error adding to stream {stream}: {e}")
+         raise
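A minimal sketch of the async cache helpers above (not part of this commit; assumes the application environment is configured and REDIS_URL points at a running Redis instance):

import asyncio

from app.services.redis_client import cache_set, cache_get, generate_cache_key

async def main() -> None:
    key = generate_cache_key("hf", "datasets", "example")  # -> "hf:datasets:example"
    await cache_set(key, {"id": "user/dataset", "likes": 42}, expire=60)
    print(await cache_get(key))  # {'id': 'user/dataset', 'likes': 42}

asyncio.run(main())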
app/tasks/dataset_tasks.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import logging
+ import time
+ import asyncio
+ from datetime import datetime, timezone
+ from typing import Dict, List, Any, Optional, Tuple
+ from celery import Task, shared_task
+ from app.core.celery_app import get_celery_app
+ from app.services.hf_datasets import (
+     determine_impact_level_by_criteria,
+     get_hf_token,
+     get_dataset_size,
+     refresh_datasets_cache,
+     fetch_and_cache_all_datasets,
+ )
+ from app.services.redis_client import sync_cache_set, sync_cache_get, generate_cache_key
+ from app.core.config import settings
+ import requests
+ import os
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+
+ # Get Celery app instance
+ celery_app = get_celery_app()
+
+ # Constants
+ DATASET_CACHE_TTL = 60 * 60 * 24 * 30  # 30 days
+ BATCH_PROGRESS_CACHE_TTL = 60 * 60 * 24 * 7  # 7 days for batch progress
+ DATASET_SIZE_CACHE_TTL = 60 * 60 * 24 * 30  # 30 days for size info
+
+ @celery_app.task(name="app.tasks.dataset_tasks.refresh_hf_datasets_cache")
+ def refresh_hf_datasets_cache():
+     """Celery task to refresh the HuggingFace datasets cache in Redis."""
+     logger.info("Starting refresh of HuggingFace datasets cache via Celery task.")
+     try:
+         refresh_datasets_cache()
+         logger.info("Successfully refreshed HuggingFace datasets cache.")
+         return {"status": "success"}
+     except Exception as e:
+         logger.error(f"Failed to refresh HuggingFace datasets cache: {e}")
+         return {"status": "error", "error": str(e)}
+
+ @shared_task(bind=True, max_retries=3, default_retry_delay=10)
+ def fetch_datasets_page(self, offset, limit):
+     """
+     Celery task to fetch and cache a single page of datasets from Hugging Face.
+     Retries on failure.
+     """
+     logger.info(f"[fetch_datasets_page] ENTRY: offset={offset}, limit={limit}")
+     try:
+         from app.services.hf_datasets import process_datasets_page
+         logger.info(f"[fetch_datasets_page] Calling process_datasets_page with offset={offset}, limit={limit}")
+         result = process_datasets_page(offset, limit)
+         logger.info(f"[fetch_datasets_page] SUCCESS: offset={offset}, limit={limit}, result={result}")
+         return result
+     except Exception as exc:
+         logger.error(f"[fetch_datasets_page] ERROR: offset={offset}, limit={limit}, exc={exc}", exc_info=True)
+         raise self.retry(exc=exc)
+
+ @shared_task(bind=True, max_retries=3, default_retry_delay=60)
+ def refresh_hf_datasets_full_cache(self):
+     logger.info("[refresh_hf_datasets_full_cache] Starting full Hugging Face datasets cache refresh.")
+     try:
+         token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
+         if not token:
+             logger.error("[refresh_hf_datasets_full_cache] HUGGINGFACEHUB_API_TOKEN not set.")
+             return {"status": "error", "error": "HUGGINGFACEHUB_API_TOKEN not set"}
+         count = fetch_and_cache_all_datasets(token)
+         logger.info(f"[refresh_hf_datasets_full_cache] Cached {count} datasets.")
+         return {"status": "ok", "cached": count}
+     except Exception as exc:
+         logger.error(f"[refresh_hf_datasets_full_cache] ERROR: {exc}", exc_info=True)
+         raise self.retry(exc=exc)
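These tasks only run if something enqueues them; a common pattern is a Celery beat schedule. The snippet below is a hedged sketch, not code from this commit: the schedule entry names and intervals are assumptions, and the real project may configure beat elsewhere (for example in app/core/celery_app.py). The task names rely on Celery's default naming for shared_task plus the explicit name given in the decorator above.

# Hypothetical beat schedule (assumption, not part of this commit)
from celery.schedules import crontab
from app.core.celery_app import get_celery_app

celery_app = get_celery_app()

celery_app.conf.beat_schedule = {
    "refresh-hf-datasets-cache-hourly": {
        "task": "app.tasks.dataset_tasks.refresh_hf_datasets_cache",
        "schedule": crontab(minute=0),  # top of every hour
    },
    "refresh-hf-datasets-full-cache-nightly": {
        "task": "app.tasks.dataset_tasks.refresh_hf_datasets_full_cache",
        "schedule": crontab(hour=3, minute=0),  # once a day at 03:00
    },
}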
migrations/20250620000000_create_combined_datasets_table.sql ADDED
@@ -0,0 +1,57 @@
+ -- Create combined_datasets table
+ CREATE TABLE IF NOT EXISTS public.combined_datasets (
+     id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+     name TEXT NOT NULL,
+     description TEXT,
+     source_datasets TEXT[] NOT NULL,
+     created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+     updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+     created_by UUID REFERENCES auth.users(id),
+     impact_level TEXT CHECK (impact_level = ANY (ARRAY['low', 'medium', 'high']::text[])),
+     status TEXT NOT NULL DEFAULT 'processing',
+     combination_strategy TEXT NOT NULL DEFAULT 'merge',
+     size_bytes BIGINT,
+     file_count INTEGER,
+     downloads INTEGER,
+     likes INTEGER
+ );
+
+ -- Add indexes for faster querying
+ CREATE INDEX IF NOT EXISTS idx_combined_datasets_created_by ON public.combined_datasets(created_by);
+ CREATE INDEX IF NOT EXISTS idx_combined_datasets_impact_level ON public.combined_datasets(impact_level);
+ CREATE INDEX IF NOT EXISTS idx_combined_datasets_status ON public.combined_datasets(status);
+
+ -- Add Row Level Security (RLS) policies
+ ALTER TABLE public.combined_datasets ENABLE ROW LEVEL SECURITY;
+
+ -- Policy to allow users to see all combined datasets
+ CREATE POLICY "Anyone can view combined datasets"
+     ON public.combined_datasets
+     FOR SELECT USING (true);
+
+ -- Policy to allow users to create their own combined datasets
+ CREATE POLICY "Users can create their own combined datasets"
+     ON public.combined_datasets
+     FOR INSERT
+     WITH CHECK (auth.uid() = created_by);
+
+ -- Policy to allow users to update only their own combined datasets
+ CREATE POLICY "Users can update their own combined datasets"
+     ON public.combined_datasets
+     FOR UPDATE
+     USING (auth.uid() = created_by);
+
+ -- Function to automatically update updated_at timestamp
+ CREATE OR REPLACE FUNCTION update_combined_datasets_updated_at()
+ RETURNS TRIGGER AS $$
+ BEGIN
+     NEW.updated_at = now();
+     RETURN NEW;
+ END;
+ $$ LANGUAGE plpgsql;
+
+ -- Trigger to automatically update updated_at timestamp
+ CREATE TRIGGER update_combined_datasets_updated_at_trigger
+     BEFORE UPDATE ON public.combined_datasets
+     FOR EACH ROW
+     EXECUTE FUNCTION update_combined_datasets_updated_at();
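For reference, a row in this table can be written and read back from Python. The sketch below is illustrative only: it uses psycopg2 with a placeholder DSN and placeholder values, assumes the uuid-ossp extension (needed for uuid_generate_v4()) is already enabled, and ignores the fact that under Supabase RLS the insert would normally go through an authenticated role so that auth.uid() matches created_by.

# Illustrative only (assumption, not part of the migration)
import psycopg2

DSN = "postgresql://user:password@localhost:5432/postgres"  # placeholder

with psycopg2.connect(DSN) as conn:
    with conn.cursor() as cur:
        cur.execute(
            """
            INSERT INTO public.combined_datasets (name, description, source_datasets, impact_level)
            VALUES (%s, %s, %s, %s)
            RETURNING id, status, created_at
            """,
            # psycopg2 adapts the Python list to a Postgres TEXT[] array
            ("my-combined-set", "Demo merge", ["ds/one", "ds/two"], "low"),
        )
        print(cur.fetchone())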
setup.py ADDED
@@ -0,0 +1,8 @@
+ from setuptools import setup, find_packages
+
+ setup(
+     name="collinear-tool",
+     version="0.1.0",
+     packages=find_packages(),
+     include_package_data=True,
+ )
tests/test_datasets.py ADDED
@@ -0,0 +1,78 @@
+ import os
+ import pytest
+ import requests
+
+ BASE_URL = os.environ.get("BASE_URL", "http://127.0.0.1:8000/api")
+
+ # --- /datasets ---
+ def test_list_datasets_http():
+     resp = requests.get(f"{BASE_URL}/datasets")
+     assert resp.status_code == 200
+     data = resp.json()
+     assert "items" in data
+     assert "total" in data
+     assert "warming_up" in data
+
+ def test_list_datasets_offset_limit_http():
+     resp = requests.get(f"{BASE_URL}/datasets?offset=0&limit=3")
+     assert resp.status_code == 200
+     data = resp.json()
+     assert isinstance(data["items"], list)
+     assert len(data["items"]) <= 3
+
+ def test_list_datasets_large_offset_http():
+     resp = requests.get(f"{BASE_URL}/datasets?offset=99999&limit=2")
+     assert resp.status_code == 200
+     data = resp.json()
+     assert data["items"] == []
+     assert "warming_up" in data
+
+ def test_list_datasets_invalid_limit_http():
+     resp = requests.get(f"{BASE_URL}/datasets?limit=-5")
+     assert resp.status_code == 422
+
+ # --- /datasets/cache-status ---
+ def test_cache_status_http():
+     resp = requests.get(f"{BASE_URL}/datasets/cache-status")
+     assert resp.status_code == 200
+     data = resp.json()
+     assert "warming_up" in data
+     assert "total_items" in data
+     assert "last_update" in data
+
+ # --- /datasets/{dataset_id}/commits ---
+ def test_commits_valid_http():
+     resp = requests.get(f"{BASE_URL}/datasets/openbmb/Ultra-FineWeb/commits")
+     assert resp.status_code in (200, 404)
+     if resp.status_code == 200:
+         assert isinstance(resp.json(), list)
+
+ def test_commits_invalid_http():
+     resp = requests.get(f"{BASE_URL}/datasets/invalid-dataset-id/commits")
+     assert resp.status_code in (404, 422)
+
+ # --- /datasets/{dataset_id}/files ---
+ def test_files_valid_http():
+     resp = requests.get(f"{BASE_URL}/datasets/openbmb/Ultra-FineWeb/files")
+     assert resp.status_code in (200, 404)
+     if resp.status_code == 200:
+         assert isinstance(resp.json(), list)
+
+ def test_files_invalid_http():
+     resp = requests.get(f"{BASE_URL}/datasets/invalid-dataset-id/files")
+     assert resp.status_code in (404, 422)
+
+ # --- /datasets/{dataset_id}/file-url ---
+ def test_file_url_valid_http():
+     resp = requests.get(f"{BASE_URL}/datasets/openbmb/Ultra-FineWeb/file-url", params={"filename": "README.md"})
+     assert resp.status_code in (200, 404)
+     if resp.status_code == 200:
+         assert "download_url" in resp.json()
+
+ def test_file_url_invalid_file_http():
+     resp = requests.get(f"{BASE_URL}/datasets/openbmb/Ultra-FineWeb/file-url", params={"filename": "not_a_real_file.txt"})
+     assert resp.status_code in (404, 200)
+
+ def test_file_url_missing_filename_http():
+     resp = requests.get(f"{BASE_URL}/datasets/openbmb/Ultra-FineWeb/file-url")
+     assert resp.status_code in (404, 422)
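These tests hit a live server at BASE_URL rather than the in-process app, so they fail with connection errors when nothing is listening. A guard like the sketch below (hypothetical, e.g. placed in a conftest.py; the fixture name, probe endpoint, and timeout are assumptions) would skip the module instead:

# Hypothetical guard (not in this commit): skip live-HTTP tests when the API
# at BASE_URL is not reachable, instead of failing with connection errors.
import os
import pytest
import requests

BASE_URL = os.environ.get("BASE_URL", "http://127.0.0.1:8000/api")

@pytest.fixture(autouse=True)
def require_live_server():
    try:
        requests.get(f"{BASE_URL}/datasets/cache-status", timeout=2)
    except requests.exceptions.ConnectionError:
        pytest.skip("API server is not running at BASE_URL")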
tests/test_datasets_api.py ADDED
@@ -0,0 +1,88 @@
+ import pytest
+ from fastapi.testclient import TestClient
+ from app.main import app
+
+ client = TestClient(app)
+
+ # --- /api/datasets ---
+ def test_list_datasets_default():
+     resp = client.get("/api/datasets")
+     assert resp.status_code == 200
+     data = resp.json()
+     assert "items" in data
+     assert isinstance(data["items"], list)
+     assert "total" in data
+     assert "warming_up" in data
+
+ def test_list_datasets_offset_limit():
+     resp = client.get("/api/datasets?offset=0&limit=2")
+     assert resp.status_code == 200
+     data = resp.json()
+     assert isinstance(data["items"], list)
+     assert len(data["items"]) <= 2
+
+ def test_list_datasets_large_offset():
+     resp = client.get("/api/datasets?offset=100000&limit=2")
+     assert resp.status_code == 200
+     data = resp.json()
+     assert data["items"] == []
+     assert data["warming_up"] in (True, False)
+
+ def test_list_datasets_negative_limit():
+     resp = client.get("/api/datasets?limit=-1")
+     assert resp.status_code == 422
+
+ def test_list_datasets_missing_params():
+     resp = client.get("/api/datasets")
+     assert resp.status_code == 200
+     data = resp.json()
+     assert "items" in data
+     assert "total" in data
+     assert "warming_up" in data
+
+ # --- /api/datasets/cache-status ---
+ def test_cache_status():
+     resp = client.get("/api/datasets/cache-status")
+     assert resp.status_code == 200
+     data = resp.json()
+     assert "warming_up" in data
+     assert "total_items" in data
+     assert "last_update" in data
+
+ # --- /api/datasets/{dataset_id}/commits ---
+ def test_get_commits_valid():
+     resp = client.get("/api/datasets/openbmb/Ultra-FineWeb/commits")
+     # Accept 200 (found) or 404 (not found)
+     assert resp.status_code in (200, 404)
+     if resp.status_code == 200:
+         assert isinstance(resp.json(), list)
+
+ def test_get_commits_invalid():
+     resp = client.get("/api/datasets/invalid-dataset-id/commits")
+     assert resp.status_code in (404, 422)
+
+ # --- /api/datasets/{dataset_id}/files ---
+ def test_list_files_valid():
+     resp = client.get("/api/datasets/openbmb/Ultra-FineWeb/files")
+     assert resp.status_code in (200, 404)
+     if resp.status_code == 200:
+         assert isinstance(resp.json(), list)
+
+ def test_list_files_invalid():
+     resp = client.get("/api/datasets/invalid-dataset-id/files")
+     assert resp.status_code in (404, 422)
+
+ # --- /api/datasets/{dataset_id}/file-url ---
+ def test_get_file_url_valid():
+     resp = client.get("/api/datasets/openbmb/Ultra-FineWeb/file-url", params={"filename": "README.md"})
+     assert resp.status_code in (200, 404)
+     if resp.status_code == 200:
+         assert "download_url" in resp.json()
+
+ def test_get_file_url_invalid_file():
+     resp = client.get("/api/datasets/openbmb/Ultra-FineWeb/file-url", params={"filename": "not_a_real_file.txt"})
+     assert resp.status_code in (404, 200)
+
+ def test_get_file_url_missing_filename():
+     resp = client.get("/api/datasets/openbmb/Ultra-FineWeb/file-url")
+     assert resp.status_code in (404, 422)
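Because these TestClient tests import app.main directly, the app's settings must be loadable without real credentials. The conftest.py sketch below is an assumption about this project's configuration, not part of the commit; the variable names mirror the ones used in dataset_tasks.py and tests/test_datasets.py.

# Hypothetical tests/conftest.py (assumption): provide harmless defaults so the
# FastAPI app can be imported by TestClient without real credentials.
import os

os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "hf_dummy_token_for_tests")
os.environ.setdefault("BASE_URL", "http://127.0.0.1:8000/api")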