# ---
# title: Swe Text Summarizer
# emoji: 🔥
# colorFrom: indigo
# colorTo: indigo
# sdk: gradio
# sdk_version: 3.4
# app_file: app.py
# pinned: true
# ---
# NOTE(review): the front matter above was found fused to the top of this
# file. It is preserved as comments so the module parses as Python; it does
# not appear to describe this benchmark script -- confirm whether it belongs
# here.
"""Benchmark PyLaia text recognition across different chunk sizes.

Running this module as a script executes ``quick_test()``.
"""

import time
from pathlib import Path

import numpy as np

# Import your PyLaia implementation
from htrflow.models.teklia.pylaia import PyLaia
from htrflow.utils.imgproc import read

NORMAL_IMAGE_PATH = "examples/images/lines/A0068699_00021_region0_line1.jpg"

# Prediction settings shared by the warm-up and the timed run, so the warm-up
# exercises exactly the configuration that is being measured.
_PREDICT_KWARGS = {"batch_size": 8, "temperature": 1.0}

_SEPARATOR = "=" * 80


def create_test_images(base_image_path, num_images=100):
    """Create test images - mix of real and synthetic variations.

    Images at even indices are unmodified copies of the image read from
    ``base_image_path``; images at odd indices are copies with Gaussian noise
    (mean 0, standard deviation 10) added and clipped to ``[0, 255]``.

    Args:
        base_image_path: Path of the image passed to ``read``.
        num_images: Number of images to generate.

    Returns:
        A list of ``num_images`` arrays.
    """
    # Assumes `read` returns a uint8 NumPy array -- TODO confirm.
    real_image = read(base_image_path)

    images = []
    for index in range(num_images):
        if index % 2 == 0:
            # Use the real image
            images.append(real_image.copy())
            continue

        # The noise must stay signed until after clipping. Casting the float
        # samples straight to uint8 would not preserve negative values, so the
        # noise would no longer be zero-mean.
        noise = np.random.normal(0, 10, real_image.shape).astype(np.int16)
        noisy_image = np.clip(real_image.astype(np.int16) + noise, 0, 255)
        images.append(noisy_image.astype(np.uint8))

    return images


def _print_summary_table(results, baseline_time):
    """Print one row of timing statistics per benchmarked chunk size."""
    print(f"\n{_SEPARATOR}")
    print("PERFORMANCE SUMMARY")
    print(_SEPARATOR)
    print(f"{'Chunk Size':>12} | {'Total Time':>10} | {'Images/sec':>12} | {'ms/image':>10} | {'Speedup':>10}")
    print(f"{'-'*12}-+-{'-'*10}-+-{'-'*12}-+-{'-'*10}-+-{'-'*10}")

    for chunk_size in sorted(results):
        data = results[chunk_size]
        speedup = baseline_time / data['time']
        print(f"{chunk_size:>12} | {data['time']:>10.2f}s | {data['images_per_second']:>12.2f} | "
              f"{data['ms_per_image']:>10.2f} | {speedup:>10.2f}x")


def _verify_consistency(results, baseline_chunk):
    """Compare the first text of every prediction against the baseline run."""
    print(f"\n{_SEPARATOR}")
    print("Verifying result consistency...")

    baseline_texts = [r.texts[0] for r in results[baseline_chunk]['predictions']]
    all_consistent = True
    for chunk_size, data in results.items():
        if chunk_size == baseline_chunk:
            continue
        chunk_texts = [r.texts[0] for r in data['predictions']]
        if chunk_texts != baseline_texts:
            print(f" ✗ Results mismatch for chunk_size={chunk_size}")
            all_consistent = False

    if all_consistent:
        print(" ✓ All chunk sizes produced identical results")


def benchmark_pylaia(model_name="Teklia/pylaia-belfort", num_images=100):
    """Benchmark PyLaia with different chunk sizes.

    For every chunk size in ``[1, 5, 10, 20, 50, 100]`` that does not exceed
    ``num_images``, performs a warm-up prediction on up to five images and
    then times a prediction over all test images. Prints a summary table,
    checks that all chunk sizes produced the same texts as the smallest chunk
    size, and reports the fastest configuration.

    Args:
        model_name: Model identifier passed to ``PyLaia``.
        num_images: Number of test images to generate and predict on.

    Returns:
        A dict mapping each benchmarked chunk size to a dict with the keys
        ``'time'`` (seconds), ``'images_per_second'``, ``'ms_per_image'`` and
        ``'predictions'``.

    Raises:
        ValueError: If ``num_images`` is less than 1.
    """
    # Without at least one image no chunk size can be benchmarked and the
    # per-image statistics would divide by zero.
    if num_images < 1:
        raise ValueError(f"num_images must be at least 1, got {num_images}")

    print(f"\n{_SEPARATOR}")
    print("PyLaia Chunking Performance Benchmark")
    print(_SEPARATOR)
    print(f"Model: {model_name}")
    print(f"Number of test images: {num_images}")

    # Initialize model
    print("\nInitializing model...")
    model = PyLaia(model_name)
    print(f"Device: {model.device}")

    # Create test images
    print(f"\nCreating {num_images} test images...")
    test_images = create_test_images(NORMAL_IMAGE_PATH, num_images)

    # Test different chunk sizes
    chunk_sizes = [1, 5, 10, 20, 50, 100]
    results = {}

    print(f"\n{_SEPARATOR}")
    print("Running benchmarks...")
    print(_SEPARATOR)

    for chunk_size in chunk_sizes:
        if chunk_size > num_images:
            continue

        print(f"\nTesting chunk_size={chunk_size}...")

        # Warm-up run, using the same settings as the timed run.
        print(" Warm-up run...")
        model._predict(test_images[:5], chunk_size=chunk_size, **_PREDICT_KWARGS)

        # Actual timing. perf_counter is monotonic and high-resolution,
        # unlike time.time(), which follows the adjustable wall clock.
        print(" Timing run...")
        start_time = time.perf_counter()
        predictions = model._predict(
            test_images, chunk_size=chunk_size, **_PREDICT_KWARGS
        )
        elapsed_time = time.perf_counter() - start_time

        results[chunk_size] = {
            'time': elapsed_time,
            'images_per_second': num_images / elapsed_time,
            'ms_per_image': (elapsed_time / num_images) * 1000,
            'predictions': predictions,
        }

        print(f" ✓ Completed in {elapsed_time:.2f}s")
        print(f" Speed: {results[chunk_size]['images_per_second']:.2f} images/second")
        print(f" Time per image: {results[chunk_size]['ms_per_image']:.2f}ms")

    # The smallest benchmarked chunk size is the baseline; since
    # num_images >= 1 this is always chunk_size=1.
    baseline_chunk = min(results)
    baseline_time = results[baseline_chunk]['time']

    # Print summary table
    _print_summary_table(results, baseline_time)

    # Verify consistency
    _verify_consistency(results, baseline_chunk)

    # Find optimal chunk size
    optimal_chunk = min(results, key=lambda k: results[k]['time'])
    optimal_speedup = baseline_time / results[optimal_chunk]['time']

    print(f"\n{_SEPARATOR}")
    print("🚀 OPTIMAL CONFIGURATION")
    print(_SEPARATOR)
    print(f"Chunk size: {optimal_chunk}")
    print(f"Processing time: {results[optimal_chunk]['time']:.2f}s")
    print(f"Speed: {results[optimal_chunk]['images_per_second']:.2f} images/second")
    print(f"Speedup: {optimal_speedup:.2f}x")

    return results


def quick_test():
    """Quick test with 20 images for faster results."""
    print("\n" + _SEPARATOR)
    print("QUICK TEST (20 images)")
    print(_SEPARATOR)
    benchmark_pylaia(num_images=20)


def full_test():
    """Full test with 100 images."""
    print("\n" + _SEPARATOR)
    print("FULL TEST (100 images)")
    print(_SEPARATOR)
    benchmark_pylaia(num_images=100)


if __name__ == "__main__":
    # Run quick test first
    quick_test()

    # Uncomment to run full test
    # full_test()

    # Or run with custom parameters
    # benchmark_pylaia(model_name="Teklia/pylaia-belfort", num_images=50)