#!/usr/bin/env python3
"""
AI-Powered Conference Discovery System
This script automatically discovers new AI conferences by:
1. Scraping multiple reliable sources (WikiCFP, conference websites, etc.)
2. Using AI models to categorize and extract conference details
3. Validating and deduplicating against existing conferences
4. Adding new conferences to conferences.yml
"""
import os
import json
import yaml
import requests
import time
from datetime import datetime
from typing import Dict, List, Optional
from dataclasses import dataclass
from urllib.parse import urljoin, urlparse
import re
from bs4 import BeautifulSoup
# Configuration for target categories
TARGET_CATEGORIES = {
"machine-learning": ["machine learning", "ML", "artificial intelligence", "AI"],
"lifelong-learning": ["lifelong learning", "continual learning", "incremental learning"],
"robotics": ["robotics", "autonomous systems", "robot"],
"computer-vision": ["computer vision", "CV", "image processing", "visual recognition"],
"web-search": ["web search", "information retrieval", "search engines"],
"data-mining": ["data mining", "knowledge discovery", "big data analytics"],
"natural-language-processing": ["natural language processing", "NLP", "computational linguistics", "text mining"],
"signal-processing": ["signal processing", "DSP", "audio processing", "speech"],
"human-computer-interaction": ["HCI", "human computer interaction", "user interface", "UX"],
"computer-graphics": ["computer graphics", "visualization", "rendering", "3D"],
"mathematics": ["mathematics", "mathematical optimization", "numerical methods"],
"reinforcement-learning": ["reinforcement learning", "RL", "deep RL", "multi-agent"]
}
@dataclass
class ConferenceCandidate:
"""Data class for discovered conference candidates"""
title: str
full_name: str = ""
url: str = ""
deadline: str = ""
abstract_deadline: str = ""
conference_date: str = ""
location: str = ""
city: str = ""
country: str = ""
description: str = ""
    tags: Optional[List[str]] = None
year: int = 0
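    # 0-1 score set in the AI analysis step; the filter keeps candidates scoring >= 0.6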
confidence_score: float = 0.0
source: str = ""
def __post_init__(self):
if self.tags is None:
self.tags = []
class ConferenceDiscoveryEngine:
"""Main engine for discovering conferences using AI and web scraping"""
def __init__(self, openai_api_key: Optional[str] = None):
self.openai_api_key = openai_api_key or os.getenv('OPENAI_API_KEY')
self.session = requests.Session()
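        # Use a browser-like User-Agent; some sites block the default python-requests agent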
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})
def discover_conferences(self) -> List[ConferenceCandidate]:
"""Main method to discover conferences from multiple sources"""
candidates = []
print("🔍 Starting AI-powered conference discovery...")
# Source 1: WikiCFP
print("📊 Scraping WikiCFP...")
wikicfp_candidates = self._scrape_wikicfp()
candidates.extend(wikicfp_candidates)
# Source 2: AI Conference deadlines websites
print("🌐 Scraping popular AI deadline trackers...")
deadline_sites_candidates = self._scrape_deadline_sites()
candidates.extend(deadline_sites_candidates)
# Source 3: University AI department pages
print("🎓 Checking university AI department pages...")
university_candidates = self._scrape_university_pages()
candidates.extend(university_candidates)
# Use AI to enhance and categorize candidates
print("🤖 Using AI to analyze and categorize conferences...")
enhanced_candidates = self._ai_enhance_candidates(candidates)
# Filter and validate
print("✅ Filtering and validating candidates...")
valid_candidates = self._filter_candidates(enhanced_candidates)
print(f"🎉 Discovered {len(valid_candidates)} potential new conferences")
return valid_candidates
def _scrape_wikicfp(self) -> List[ConferenceCandidate]:
"""Scrape WikiCFP for conference information"""
candidates = []
base_url = "http://www.wikicfp.com/cfp/"
# Search for conferences in our target categories
for category, keywords in TARGET_CATEGORIES.items():
            for keyword in keywords[:2]:  # Limit queries per category to be polite to the site
try:
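                    # WikiCFP full-text search; spaces in the keyword are encoded as '+'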
search_url = f"{base_url}servlet/tool.search?q={keyword.replace(' ', '+')}&year=f"
response = self._safe_request(search_url)
if not response:
continue
soup = BeautifulSoup(response.text, 'html.parser')
conferences = self._parse_wikicfp_results(soup, category)
candidates.extend(conferences)
time.sleep(1) # Be respectful
except Exception as e:
print(f"Error scraping WikiCFP for {keyword}: {e}")
return candidates
def _parse_wikicfp_results(self, soup: BeautifulSoup, category: str) -> List[ConferenceCandidate]:
"""Parse WikiCFP search results"""
candidates = []
# WikiCFP results are typically in tables
for row in soup.find_all('tr')[1:10]: # Skip header, limit results
cells = row.find_all('td')
if len(cells) >= 4:
try:
title_cell = cells[0]
deadline_cell = cells[1]
location_cell = cells[2]
title_link = title_cell.find('a')
if title_link:
title = title_link.get_text(strip=True)
url = urljoin("http://www.wikicfp.com/cfp/", title_link.get('href', ''))
candidate = ConferenceCandidate(
title=title,
url=url,
deadline=deadline_cell.get_text(strip=True),
location=location_cell.get_text(strip=True),
tags=[category],
source="WikiCFP"
)
# Extract more details from the conference page
self._enhance_from_wikicfp_page(candidate)
candidates.append(candidate)
except Exception as e:
print(f"Error parsing WikiCFP row: {e}")
continue
return candidates
def _enhance_from_wikicfp_page(self, candidate: ConferenceCandidate):
"""Extract additional details from individual WikiCFP conference pages"""
try:
response = self._safe_request(candidate.url)
if not response:
return
soup = BeautifulSoup(response.text, 'html.parser')
# Extract conference details
content = soup.find('div', class_='cfp') or soup.find('table')
if content:
text = content.get_text()
# Extract conference dates
date_pattern = r'Conference[:\s]*([A-Za-z]+ \d{1,2}[-–—]\d{1,2}, \d{4})'
date_match = re.search(date_pattern, text)
if date_match:
candidate.conference_date = date_match.group(1)
# Extract abstract deadline
abstract_pattern = r'Abstract[:\s]*([A-Za-z]+ \d{1,2}, \d{4})'
abstract_match = re.search(abstract_pattern, text)
if abstract_match:
candidate.abstract_deadline = abstract_match.group(1)
# Extract location details
location_pattern = r'Location[:\s]*([^.\n]+)'
location_match = re.search(location_pattern, text)
if location_match:
candidate.location = location_match.group(1).strip()
except Exception as e:
print(f"Error enhancing WikiCFP page {candidate.url}: {e}")
def _scrape_deadline_sites(self) -> List[ConferenceCandidate]:
"""Scrape popular AI deadline tracking websites"""
candidates = []
# Popular deadline tracking sites
sites = [
"https://aideadlin.es/",
"https://jackietseng.github.io/conference_call_for_paper/conferences.html"
]
for site_url in sites:
try:
response = self._safe_request(site_url)
if response:
soup = BeautifulSoup(response.text, 'html.parser')
site_candidates = self._parse_deadline_site(soup, site_url)
candidates.extend(site_candidates)
except Exception as e:
print(f"Error scraping {site_url}: {e}")
return candidates
def _parse_deadline_site(self, soup: BeautifulSoup, source_url: str) -> List[ConferenceCandidate]:
"""Parse deadline tracking websites for conference info"""
candidates = []
# Look for conference entries (this will vary by site structure)
conf_elements = (soup.find_all('div', class_='conf') +
soup.find_all('tr') +
soup.find_all('li'))
for element in conf_elements[:20]: # Limit results
try:
text = element.get_text(strip=True)
                if len(text) > 10 and any(keyword.lower() in text.lower() for keywords in TARGET_CATEGORIES.values() for keyword in keywords):
# Extract conference name and deadline
title_match = re.search(r'([A-Z]{2,}[\w\s]*\d{4})', text)
deadline_match = re.search(r'(\w+ \d{1,2}, \d{4})', text)
if title_match:
candidate = ConferenceCandidate(
title=title_match.group(1),
deadline=deadline_match.group(1) if deadline_match else "",
source=f"DeadlineTracker-{urlparse(source_url).netloc}",
description=text[:200]
)
candidates.append(candidate)
            except Exception:
continue
return candidates
def _scrape_university_pages(self) -> List[ConferenceCandidate]:
"""Scrape university AI department pages for conference announcements"""
candidates = []
# Major AI research institutions
university_urls = [
"https://www.cs.stanford.edu/news/",
"https://www.csail.mit.edu/news",
"https://ai.berkeley.edu/news/",
"https://www.cs.cmu.edu/news"
]
for url in university_urls:
try:
response = self._safe_request(url)
if response:
soup = BeautifulSoup(response.text, 'html.parser')
# Look for conference-related announcements
links = soup.find_all('a', href=True)
for link in links[:10]:
link_text = link.get_text(strip=True).lower()
if ('conference' in link_text or 'cfp' in link_text or
'call for papers' in link_text):
                            # Potential conference announcement found; detail extraction for this
                            # source is not implemented yet, so it currently yields no candidates
                            pass
except Exception as e:
print(f"Error scraping {url}: {e}")
return candidates
def _ai_enhance_candidates(self, candidates: List[ConferenceCandidate]) -> List[ConferenceCandidate]:
"""Use AI to enhance and categorize conference candidates"""
if not self.openai_api_key:
print("⚠️ No OpenAI API key found. Skipping AI enhancement.")
return candidates
enhanced = []
        try:
            # Uses the OpenAI v1 Python client (openai>=1.0); older SDK versions exposed openai.ChatCompletion
            from openai import OpenAI
            client = OpenAI(api_key=self.openai_api_key)
for candidate in candidates:
try:
# Create a prompt for the AI to analyze the conference
prompt = f"""
Analyze this conference information and provide structured data:
Title: {candidate.title}
Description: {candidate.description}
Location: {candidate.location}
Current Tags: {candidate.tags}
                    Please respond with JSON only, using exactly these keys:
                    "categories": most relevant categories from {list(TARGET_CATEGORIES.keys())}
                    "confidence_score": number from 0 to 1 that this is a legitimate AI/CS conference
                    "full_name": standardized full conference name
                    "city": city extracted from the location
                    "country": country extracted from the location
                    "year": conference year as an integer (if determinable)
"""
                    response = client.chat.completions.create(
                        model="gpt-3.5-turbo",
                        messages=[{"role": "user", "content": prompt}],
                        max_tokens=300,
                        temperature=0.1
                    )
                    ai_analysis = json.loads(response.choices[0].message.content)
# Update candidate with AI insights
candidate.tags = ai_analysis.get('categories', candidate.tags)
candidate.confidence_score = ai_analysis.get('confidence_score', 0.5)
candidate.full_name = ai_analysis.get('full_name', candidate.title)
candidate.city = ai_analysis.get('city', candidate.city)
candidate.country = ai_analysis.get('country', candidate.country)
candidate.year = ai_analysis.get('year', candidate.year)
enhanced.append(candidate)
time.sleep(0.5) # Rate limiting
except Exception as e:
print(f"Error in AI analysis for {candidate.title}: {e}")
enhanced.append(candidate) # Add without enhancement
except ImportError:
print("OpenAI package not available. Install with: pip install openai")
return candidates
return enhanced
def _filter_candidates(self, candidates: List[ConferenceCandidate]) -> List[ConferenceCandidate]:
"""Filter and validate conference candidates"""
current_year = datetime.now().year
next_year = current_year + 1
        existing_titles = {existing.get('title') for existing in self._load_existing_conferences()}
        valid_candidates = []
        for candidate in candidates:
            # Basic validation criteria
            if (candidate.confidence_score >= 0.6 and  # AI confidence threshold
                    len(candidate.title) >= 3 and  # Reasonable title length
                    candidate.tags and  # Has categories
                    candidate.year in (current_year, next_year) and  # Current or next year only
                    candidate.title not in existing_titles):  # Not already tracked
valid_candidates.append(candidate)
return valid_candidates
def _load_existing_conferences(self) -> List[Dict]:
"""Load existing conferences to avoid duplicates"""
try:
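            # Path is relative to the repository root; the script is expected to run from there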
with open('src/data/conferences.yml', 'r') as f:
return yaml.safe_load(f) or []
except FileNotFoundError:
return []
def _safe_request(self, url: str, timeout: int = 10) -> Optional[requests.Response]:
"""Make a safe HTTP request with error handling"""
try:
response = self.session.get(url, timeout=timeout)
response.raise_for_status()
return response
except Exception as e:
print(f"Request failed for {url}: {e}")
return None
def add_to_conferences_yml(self, candidates: List[ConferenceCandidate]) -> int:
"""Add validated candidates to conferences.yml"""
if not candidates:
return 0
# Load existing conferences
existing_conferences = self._load_existing_conferences()
added_count = 0
for candidate in candidates:
            # Convert to conference format
            year = candidate.year or datetime.now().year + 1
            conference_entry = {
                'title': candidate.title,
                'year': year,
                'id': self._generate_conference_id(candidate.title, year),
'full_name': candidate.full_name or candidate.title,
'link': candidate.url,
'deadline': self._parse_deadline(candidate.deadline),
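                # 'AoE' = Anywhere on Earth (UTC-12), a common default for paper deadlines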
'timezone': 'AoE', # Default timezone
'date': candidate.conference_date,
'tags': candidate.tags,
'city': candidate.city,
'country': candidate.country,
'note': f'Auto-discovered from {candidate.source}. Please verify details.'
}
# Add abstract deadline if available
if candidate.abstract_deadline:
conference_entry['abstract_deadline'] = self._parse_deadline(candidate.abstract_deadline)
existing_conferences.append(conference_entry)
added_count += 1
# Sort conferences by deadline
        existing_conferences.sort(key=lambda x: str(x.get('deadline') or '9999'))
# Write back to file
with open('src/data/conferences.yml', 'w') as f:
yaml.dump(existing_conferences, f, default_flow_style=False, sort_keys=False)
return added_count
def _generate_conference_id(self, title: str, year: int) -> str:
"""Generate a unique conference ID"""
# Extract acronym or use first few letters
words = title.split()
if len(words) > 1:
acronym = ''.join([word[0].lower() for word in words if word[0].isupper()])
if len(acronym) >= 2:
return f"{acronym}{str(year)[-2:]}"
# Fallback to first few letters + year
clean_title = re.sub(r'[^a-zA-Z0-9]', '', title.lower())
return f"{clean_title[:6]}{str(year)[-2:]}"
def _parse_deadline(self, deadline_str: str) -> str:
"""Parse deadline string into standardized format"""
if not deadline_str:
return ""
try:
# Try to parse various deadline formats
deadline_patterns = [
r'(\w+ \d{1,2}, \d{4})',
r'(\d{4}-\d{2}-\d{2})',
r'(\d{1,2}/\d{1,2}/\d{4})'
]
for pattern in deadline_patterns:
match = re.search(pattern, deadline_str)
if match:
date_str = match.group(1)
                    # Convert to standardized format (YYYY-MM-DD HH:MM:SS)
                    for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d", "%m/%d/%Y"):
                        try:
                            parsed_date = datetime.strptime(date_str, fmt)
                            return parsed_date.strftime("%Y-%m-%d 23:59:59")
                        except ValueError:
                            continue
return deadline_str # Return as-is if parsing fails
except Exception:
return deadline_str
def main():
"""Main function to run conference discovery"""
print("🚀 Starting AI-Powered Conference Discovery System")
# Initialize the discovery engine
engine = ConferenceDiscoveryEngine()
# Discover conferences
candidates = engine.discover_conferences()
if candidates:
print(f"\n📋 Found {len(candidates)} potential conferences:")
for candidate in candidates:
print(f" • {candidate.title} ({candidate.confidence_score:.2f} confidence) - {candidate.tags}")
# Add to conferences.yml
added_count = engine.add_to_conferences_yml(candidates)
print(f"\n✅ Added {added_count} new conferences to conferences.yml")
else:
print("❌ No new conferences discovered")
if __name__ == "__main__":
main()