"""
AI-Powered Conference Discovery System

This script automatically discovers new AI conferences by:

1. Scraping multiple reliable sources (WikiCFP, conference websites, etc.)
2. Using AI models to categorize and extract conference details
3. Validating and deduplicating against existing conferences
4. Adding new conferences to conferences.yml
"""

import json
import os
import re
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
import yaml
from bs4 import BeautifulSoup
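
# Third-party dependencies (inferred from the imports above): requests,
# beautifulsoup4 and PyYAML; the `openai` package is only needed for the optional
# AI enhancement step, which is enabled by setting the OPENAI_API_KEY environment
# variable. A typical run might look like this (the filename is an assumption):
#
#   OPENAI_API_KEY=sk-... python conference_discovery.py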

# Target categories and the keywords used to search for each of them
TARGET_CATEGORIES = {
    "machine-learning": ["machine learning", "ML", "artificial intelligence", "AI"],
    "lifelong-learning": ["lifelong learning", "continual learning", "incremental learning"],
    "robotics": ["robotics", "autonomous systems", "robot"],
    "computer-vision": ["computer vision", "CV", "image processing", "visual recognition"],
    "web-search": ["web search", "information retrieval", "search engines"],
    "data-mining": ["data mining", "knowledge discovery", "big data analytics"],
    "natural-language-processing": ["natural language processing", "NLP", "computational linguistics", "text mining"],
    "signal-processing": ["signal processing", "DSP", "audio processing", "speech"],
    "human-computer-interaction": ["HCI", "human computer interaction", "user interface", "UX"],
    "computer-graphics": ["computer graphics", "visualization", "rendering", "3D"],
    "mathematics": ["mathematics", "mathematical optimization", "numerical methods"],
    "reinforcement-learning": ["reinforcement learning", "RL", "deep RL", "multi-agent"]
}

@dataclass
class ConferenceCandidate:
    """Data class for discovered conference candidates"""
    title: str
    full_name: str = ""
    url: str = ""
    deadline: str = ""
    abstract_deadline: str = ""
    conference_date: str = ""
    location: str = ""
    city: str = ""
    country: str = ""
    description: str = ""
    tags: Optional[List[str]] = None
    year: int = 0
    confidence_score: float = 0.0
    source: str = ""

    def __post_init__(self):
        # A mutable default is not allowed on a dataclass field, so tags
        # defaults to None and is normalized to an empty list here.
        if self.tags is None:
            self.tags = []

class ConferenceDiscoveryEngine:
    """Main engine for discovering conferences using AI and web scraping"""

    def __init__(self, openai_api_key: Optional[str] = None):
        self.openai_api_key = openai_api_key or os.getenv('OPENAI_API_KEY')
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def discover_conferences(self) -> List[ConferenceCandidate]:
        """Main method to discover conferences from multiple sources"""
        candidates = []

        print("🔍 Starting AI-powered conference discovery...")

        print("📊 Scraping WikiCFP...")
        wikicfp_candidates = self._scrape_wikicfp()
        candidates.extend(wikicfp_candidates)

        print("🌐 Scraping popular AI deadline trackers...")
        deadline_sites_candidates = self._scrape_deadline_sites()
        candidates.extend(deadline_sites_candidates)

        print("🎓 Checking university AI department pages...")
        university_candidates = self._scrape_university_pages()
        candidates.extend(university_candidates)

        print("🤖 Using AI to analyze and categorize conferences...")
        enhanced_candidates = self._ai_enhance_candidates(candidates)

        print("✅ Filtering and validating candidates...")
        valid_candidates = self._filter_candidates(enhanced_candidates)

        print(f"🎉 Discovered {len(valid_candidates)} potential new conferences")
        return valid_candidates

    def _scrape_wikicfp(self) -> List[ConferenceCandidate]:
        """Scrape WikiCFP for conference information"""
        candidates = []
        base_url = "http://www.wikicfp.com/cfp/"

        for category, keywords in TARGET_CATEGORIES.items():
            # Only the first two keywords per category are searched to keep the
            # number of requests manageable.
            for keyword in keywords[:2]:
                try:
                    search_url = f"{base_url}servlet/tool.search?q={keyword.replace(' ', '+')}&year=f"
                    response = self._safe_request(search_url)
                    if not response:
                        continue

                    soup = BeautifulSoup(response.text, 'html.parser')
                    conferences = self._parse_wikicfp_results(soup, category)
                    candidates.extend(conferences)

                    # Be polite to WikiCFP between searches
                    time.sleep(1)
                except Exception as e:
                    print(f"Error scraping WikiCFP for {keyword}: {e}")

        return candidates

    def _parse_wikicfp_results(self, soup: BeautifulSoup, category: str) -> List[ConferenceCandidate]:
        """Parse WikiCFP search results"""
        candidates = []

        # Skip the header row and only look at the first few result rows
        for row in soup.find_all('tr')[1:10]:
            cells = row.find_all('td')
            if len(cells) >= 4:
                try:
                    title_cell = cells[0]
                    deadline_cell = cells[1]
                    location_cell = cells[2]

                    title_link = title_cell.find('a')
                    if title_link:
                        title = title_link.get_text(strip=True)
                        url = urljoin("http://www.wikicfp.com/cfp/", title_link.get('href', ''))

                        candidate = ConferenceCandidate(
                            title=title,
                            url=url,
                            deadline=deadline_cell.get_text(strip=True),
                            location=location_cell.get_text(strip=True),
                            tags=[category],
                            source="WikiCFP"
                        )

                        # Visit the conference's own WikiCFP page for extra details
                        self._enhance_from_wikicfp_page(candidate)
                        candidates.append(candidate)

                except Exception as e:
                    print(f"Error parsing WikiCFP row: {e}")
                    continue

        return candidates

    def _enhance_from_wikicfp_page(self, candidate: ConferenceCandidate):
        """Extract additional details from individual WikiCFP conference pages"""
        try:
            response = self._safe_request(candidate.url)
            if not response:
                return

            soup = BeautifulSoup(response.text, 'html.parser')

            content = soup.find('div', class_='cfp') or soup.find('table')
            if content:
                text = content.get_text()

                # Conference date, e.g. "Conference: July 10-14, 2026"
                date_pattern = r'Conference[:\s]*([A-Za-z]+ \d{1,2}[-–—]\d{1,2}, \d{4})'
                date_match = re.search(date_pattern, text)
                if date_match:
                    candidate.conference_date = date_match.group(1)

                # Abstract deadline, e.g. "Abstract: March 1, 2026"
                abstract_pattern = r'Abstract[:\s]*([A-Za-z]+ \d{1,2}, \d{4})'
                abstract_match = re.search(abstract_pattern, text)
                if abstract_match:
                    candidate.abstract_deadline = abstract_match.group(1)

                # Location, e.g. "Location: Vienna, Austria"
                location_pattern = r'Location[:\s]*([^.\n]+)'
                location_match = re.search(location_pattern, text)
                if location_match:
                    candidate.location = location_match.group(1).strip()

        except Exception as e:
            print(f"Error enhancing WikiCFP page {candidate.url}: {e}")

    def _scrape_deadline_sites(self) -> List[ConferenceCandidate]:
        """Scrape popular AI deadline tracking websites"""
        candidates = []

        sites = [
            "https://aideadlin.es/",
            "https://jackietseng.github.io/conference_call_for_paper/conferences.html"
        ]

        for site_url in sites:
            try:
                response = self._safe_request(site_url)
                if response:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    site_candidates = self._parse_deadline_site(soup, site_url)
                    candidates.extend(site_candidates)
            except Exception as e:
                print(f"Error scraping {site_url}: {e}")

        return candidates

    def _parse_deadline_site(self, soup: BeautifulSoup, source_url: str) -> List[ConferenceCandidate]:
        """Parse deadline tracking websites for conference info"""
        candidates = []

        # These sites vary in markup, so cast a wide net over likely containers
        conf_elements = (soup.find_all('div', class_='conf') +
                         soup.find_all('tr') +
                         soup.find_all('li'))

        for element in conf_elements[:20]:
            try:
                text = element.get_text(strip=True)
                # Compare lowercase on both sides so keywords like "ML" or "AI" can match
                if len(text) > 10 and any(keyword.lower() in text.lower()
                                          for keywords in TARGET_CATEGORIES.values()
                                          for keyword in keywords):
                    # Heuristics: an acronym followed by a year, and a textual date
                    title_match = re.search(r'([A-Z]{2,}[\w\s]*\d{4})', text)
                    deadline_match = re.search(r'(\w+ \d{1,2}, \d{4})', text)

                    if title_match:
                        candidate = ConferenceCandidate(
                            title=title_match.group(1),
                            deadline=deadline_match.group(1) if deadline_match else "",
                            source=f"DeadlineTracker-{urlparse(source_url).netloc}",
                            description=text[:200]
                        )
                        candidates.append(candidate)

            except Exception:
                continue

        return candidates

    def _scrape_university_pages(self) -> List[ConferenceCandidate]:
        """Scrape university AI department pages for conference announcements"""
        candidates = []

        university_urls = [
            "https://www.cs.stanford.edu/news/",
            "https://www.csail.mit.edu/news",
            "https://ai.berkeley.edu/news/",
            "https://www.cs.cmu.edu/news"
        ]

        for url in university_urls:
            try:
                response = self._safe_request(url)
                if response:
                    soup = BeautifulSoup(response.text, 'html.parser')

                    links = soup.find_all('a', href=True)
                    for link in links[:10]:
                        link_text = link.get_text(strip=True).lower()
                        if ('conference' in link_text or 'cfp' in link_text or
                                'call for papers' in link_text):
                            # Placeholder: extracting structured details from
                            # announcement pages is not implemented yet, so this
                            # source currently contributes no candidates.
                            pass
            except Exception as e:
                print(f"Error scraping {url}: {e}")

        return candidates

    def _ai_enhance_candidates(self, candidates: List[ConferenceCandidate]) -> List[ConferenceCandidate]:
        """Use AI to enhance and categorize conference candidates"""
        if not self.openai_api_key:
            print("⚠️ No OpenAI API key found. Skipping AI enhancement.")
            return candidates

        enhanced = []

        try:
            # Note: this uses the legacy (pre-1.0) openai SDK interface.
            import openai
            openai.api_key = self.openai_api_key

            for candidate in candidates:
                try:
                    prompt = f"""
                    Analyze this conference information and provide structured data:

                    Title: {candidate.title}
                    Description: {candidate.description}
                    Location: {candidate.location}
                    Current Tags: {candidate.tags}

                    Please provide:
                    1. Most appropriate categories from: {list(TARGET_CATEGORIES.keys())}
                    2. Confidence score (0-1) that this is a legitimate AI/CS conference
                    3. Standardized full conference name
                    4. Extracted city and country from location
                    5. Year (if determinable)

                    Respond in JSON format only.
                    """

                    response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=[{"role": "user", "content": prompt}],
                        max_tokens=300,
                        temperature=0.1
                    )

                    # If the model does not return valid JSON, json.loads raises and
                    # the candidate falls through to the except branch below.
                    ai_analysis = json.loads(response.choices[0].message.content)

                    candidate.tags = ai_analysis.get('categories', candidate.tags)
                    candidate.confidence_score = ai_analysis.get('confidence_score', 0.5)
                    candidate.full_name = ai_analysis.get('full_name', candidate.title)
                    candidate.city = ai_analysis.get('city', candidate.city)
                    candidate.country = ai_analysis.get('country', candidate.country)
                    candidate.year = ai_analysis.get('year', candidate.year)

                    enhanced.append(candidate)

                    time.sleep(0.5)

                except Exception as e:
                    print(f"Error in AI analysis for {candidate.title}: {e}")
                    enhanced.append(candidate)

        except ImportError:
            print("OpenAI package not available. Install with: pip install openai")
            return candidates

        return enhanced
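
    # The enhancement step above expects the model reply to be a JSON object along
    # these lines (illustrative values only):
    #
    #   {"categories": ["machine-learning"], "confidence_score": 0.9,
    #    "full_name": "Example Conference on Machine Learning",
    #    "city": "Vienna", "country": "Austria", "year": 2026}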

    def _filter_candidates(self, candidates: List[ConferenceCandidate]) -> List[ConferenceCandidate]:
        """Filter and validate conference candidates"""
        current_year = datetime.now().year
        next_year = current_year + 1

        # Load existing titles once instead of re-reading the file per candidate
        existing_titles = [existing['title'] for existing in self._load_existing_conferences()]

        valid_candidates = []

        for candidate in candidates:
            if (candidate.confidence_score >= 0.6 and
                    len(candidate.title) >= 3 and
                    candidate.tags and
                    candidate.year in (current_year, next_year) and
                    candidate.title not in existing_titles):
                valid_candidates.append(candidate)

        return valid_candidates

    def _load_existing_conferences(self) -> List[Dict]:
        """Load existing conferences to avoid duplicates"""
        try:
            with open('src/data/conferences.yml', 'r') as f:
                return yaml.safe_load(f) or []
        except FileNotFoundError:
            return []

    def _safe_request(self, url: str, timeout: int = 10) -> Optional[requests.Response]:
        """Make a safe HTTP request with error handling"""
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except Exception as e:
            print(f"Request failed for {url}: {e}")
            return None

    def add_to_conferences_yml(self, candidates: List[ConferenceCandidate]) -> int:
        """Add validated candidates to conferences.yml"""
        if not candidates:
            return 0

        existing_conferences = self._load_existing_conferences()

        added_count = 0
        for candidate in candidates:
            # Fall back to next year when no year could be determined, and use the
            # same value for both the 'year' field and the generated id.
            year = candidate.year or datetime.now().year + 1

            conference_entry = {
                'title': candidate.title,
                'year': year,
                'id': self._generate_conference_id(candidate.title, year),
                'full_name': candidate.full_name or candidate.title,
                'link': candidate.url,
                'deadline': self._parse_deadline(candidate.deadline),
                'timezone': 'AoE',
                'date': candidate.conference_date,
                'tags': candidate.tags,
                'city': candidate.city,
                'country': candidate.country,
                'note': f'Auto-discovered from {candidate.source}. Please verify details.'
            }

            if candidate.abstract_deadline:
                conference_entry['abstract_deadline'] = self._parse_deadline(candidate.abstract_deadline)

            existing_conferences.append(conference_entry)
            added_count += 1

        # Sort by deadline; str() keeps YAML-parsed dates and plain strings comparable
        existing_conferences.sort(key=lambda x: str(x.get('deadline', '9999')))

        with open('src/data/conferences.yml', 'w') as f:
            yaml.dump(existing_conferences, f, default_flow_style=False, sort_keys=False)

        return added_count
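
    # For reference, each appended entry ends up in conferences.yml roughly like
    # this (illustrative values, not a real conference):
    #
    #   - title: EXCONF
    #     year: 2026
    #     id: exconf26
    #     full_name: Example Conference on Machine Learning
    #     link: https://example.org/exconf2026
    #     deadline: 2026-03-01 23:59:59
    #     timezone: AoE
    #     date: July 10-14, 2026
    #     tags:
    #     - machine-learning
    #     city: Example City
    #     country: Exampleland
    #     note: Auto-discovered from WikiCFP. Please verify details.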

    def _generate_conference_id(self, title: str, year: int) -> str:
        """Generate a unique conference ID"""
        # Prefer an acronym built from the capitalized words in the title
        words = title.split()
        if len(words) > 1:
            acronym = ''.join([word[0].lower() for word in words if word[0].isupper()])
            if len(acronym) >= 2:
                return f"{acronym}{str(year)[-2:]}"

        # Otherwise fall back to the first characters of the cleaned title
        clean_title = re.sub(r'[^a-zA-Z0-9]', '', title.lower())
        return f"{clean_title[:6]}{str(year)[-2:]}"

    def _parse_deadline(self, deadline_str: str) -> str:
        """Parse a deadline string into the standardized 'YYYY-MM-DD 23:59:59' format"""
        if not deadline_str:
            return ""

        # Supported textual date patterns, each paired with its strptime format
        deadline_patterns = [
            (r'(\w+ \d{1,2}, \d{4})', "%B %d, %Y"),
            (r'(\d{4}-\d{2}-\d{2})', "%Y-%m-%d"),
            (r'(\d{1,2}/\d{1,2}/\d{4})', "%m/%d/%Y"),
        ]

        try:
            for pattern, date_format in deadline_patterns:
                match = re.search(pattern, deadline_str)
                if not match:
                    continue
                try:
                    parsed_date = datetime.strptime(match.group(1), date_format)
                    return parsed_date.strftime("%Y-%m-%d 23:59:59")
                except ValueError:
                    continue

            # Nothing matched; return the original string unchanged
            return deadline_str

        except Exception:
            return deadline_str


def main():
    """Main function to run conference discovery"""
    print("🚀 Starting AI-Powered Conference Discovery System")

    engine = ConferenceDiscoveryEngine()

    candidates = engine.discover_conferences()

    if candidates:
        print(f"\n📋 Found {len(candidates)} potential conferences:")
        for candidate in candidates:
            print(f"  • {candidate.title} ({candidate.confidence_score:.2f} confidence) - {candidate.tags}")

        added_count = engine.add_to_conferences_yml(candidates)
        print(f"\n✅ Added {added_count} new conferences to conferences.yml")
    else:
        print("❌ No new conferences discovered")


if __name__ == "__main__":
    main()