# Fetch Flickr author/license metadata for a list of OpenImages image URLs.
import tqdm
from multiprocessing import Pool, cpu_count
import signal
import sys
import time
from flickrapi import FlickrAPI
# Add Flickr configuration
# NOTE(review): API credentials are hardcoded in source — move them to
# environment variables or a config file before sharing this script.
FLICKR_API_KEY = '80ef21a6f7eb0984ea613c316a89ca69'
FLICKR_API_SECRET = '4d0e8ce6734f4b3f'
# Module-level client shared by all lookups; format='parsed-json' makes
# API calls return plain dicts instead of XML.
flickr = FlickrAPI(FLICKR_API_KEY, FLICKR_API_SECRET, format='parsed-json', store_token=False)
def get_photo_id(url):
    """Extract the numeric photo ID from a Flickr image URL.

    Flickr file names look like "<photo_id>_<secret>_<size>.jpg", so the ID
    is the segment before the first underscore in the last path component.

    Args:
        url: Flickr image URL string.

    Returns:
        The photo ID as a string, or None if the URL cannot be parsed.
    """
    try:
        return url.split('/')[-1].split('_')[0]
    except (AttributeError, TypeError):
        # Narrowed from a bare `except:`: only non-string input can fail
        # here; a bare except would also swallow KeyboardInterrupt etc.
        return None
def get_other_info(url):
    """Look up owner and license metadata for a Flickr photo URL.

    Queries the Flickr API for the photo's owner and license. Any failure
    (unparsable URL, API/network error, unexpected response shape) falls
    back to placeholder values rather than raising.

    Args:
        url: Flickr image URL.

    Returns:
        dict with keys 'username', 'realname', 'nsid', 'flickr_url' and
        'license'; 'Unknown'/'' placeholders when the lookup fails.
    """
    try:
        photo_id = get_photo_id(url)
        if photo_id:
            # Throttle: wait 0.1 s per request to stay under the API rate limit.
            time.sleep(0.1)
            photo_info = flickr.photos.getInfo(photo_id=photo_id)
            # Renamed from `license` to avoid shadowing the builtin.
            photo_license = photo_info['photo']['license']
            owner = photo_info['photo']['owner']
            flickr_url = f"https://www.flickr.com/photos/{owner.get('nsid', '')}/{photo_id}"
            return {
                'username': owner.get('username', ''),
                'realname': owner.get('realname', ''),
                'nsid': owner.get('nsid', ''),
                'flickr_url': flickr_url,
                'license': photo_license
            }
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; API errors fall through to the placeholder below.
        pass
    return {
        'username': 'Unknown',
        'realname': 'Unknown',
        'nsid': '',
        'flickr_url': '',
        'license': 'Unknown'
    }
def init_worker():
    """Pool-worker initializer: ignore SIGINT in the child.

    Ctrl-C is then handled only by the parent process, which can shut the
    pool down cleanly instead of every worker raising KeyboardInterrupt.
    """
    signal.signal(signal.SIGINT, signal.SIG_IGN)
def process_url(url):
    """Resolve author info for a single URL, never raising.

    Any exception from the lookup is converted into an 'Error' result row
    so a failing URL cannot kill a pool worker.
    """
    try:
        return get_other_info(url)
    except Exception as exc:
        # Surface the failure as data: the message lands in 'realname'.
        return dict(
            username='Error',
            realname=str(exc),
            nsid='',
            flickr_url=url,
            license='Unknown',
        )
def process_urls_in_chunks(urls, chunk_size=100000):
    """Resolve author info for every URL using a multiprocessing pool.

    URLs are handled in slices of `chunk_size` so tqdm reports progress per
    chunk. On Ctrl-C the pool is torn down and the process exits with
    status 1.

    Args:
        urls: list of Flickr image URLs.
        chunk_size: number of URLs per progress-reported slice.

    Returns:
        List of author-info dicts, one per input URL, in input order.
    """
    results = []
    with Pool(cpu_count(), initializer=init_worker) as pool:
        try:
            # Walk the list one slice at a time; enumerate gives the
            # 1-based chunk number for the progress-bar label.
            starts = range(0, len(urls), chunk_size)
            for chunk_no, start in enumerate(starts, start=1):
                batch = urls[start:start + chunk_size]
                progress = tqdm.tqdm(
                    pool.imap(process_url, batch),
                    total=len(batch),
                    desc=f"Processing chunk {chunk_no}"
                )
                results.extend(progress)
        except KeyboardInterrupt:
            # Kill the workers immediately and signal failure to the shell.
            pool.terminate()
            pool.join()
            print("\nProcessing interrupted by user")
            sys.exit(1)
    return results
if __name__ == "__main__":
    urls_file = "data/openimages_urls.txt"
    # Read the URL list; cap at the first 100k entries to bound API usage.
    # Fixes: explicit text encoding, iterate the file directly instead of
    # materializing it twice via readlines().
    with open(urls_file, encoding="utf-8") as f:
        urls = [line.strip() for line in f][:100000]
    authors = process_urls_in_chunks(urls)
    # Count distinct usernames (set comprehension instead of set([listcomp])).
    unique_authors = len({author['username'] for author in authors})
    print(f"unique_authors: {unique_authors}")
    print(f"Number of unique authors: {unique_authors}")