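"""Fetch author and license information from Flickr for a list of image URLs.

Reads image URLs from data/openimages_urls.txt, queries the Flickr API for each
photo's owner and license in parallel worker processes, and reports the number
of unique authors found.
"""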
import tqdm
from multiprocessing import Pool, cpu_count
import signal
import sys
import time

from flickrapi import FlickrAPI

# Flickr API configuration
FLICKR_API_KEY = '80ef21a6f7eb0984ea613c316a89ca69'
FLICKR_API_SECRET = '4d0e8ce6734f4b3f'
flickr = FlickrAPI(FLICKR_API_KEY, FLICKR_API_SECRET, format='parsed-json', store_token=False)

def get_photo_id(url):
    """Extract the photo ID from a Flickr static image URL."""
    try:
        return url.split('/')[-1].split('_')[0]
    except Exception:
        return None

def get_other_info(url):
    """Get author and license information for a photo from the Flickr API."""
    try:
        photo_id = get_photo_id(url)
        if photo_id:
            # Throttle requests slightly to stay within Flickr's rate limits
            time.sleep(0.1)
            photo_info = flickr.photos.getInfo(photo_id=photo_id)
            license_id = photo_info['photo']['license']
            owner = photo_info['photo']['owner']
            flickr_url = f"https://www.flickr.com/photos/{owner.get('nsid', '')}/{photo_id}"
            return {
                'username': owner.get('username', ''),
                'realname': owner.get('realname', ''),
                'nsid': owner.get('nsid', ''),
                'flickr_url': flickr_url,
                'license': license_id
            }
    except Exception:
        pass
    return {
        'username': 'Unknown',
        'realname': 'Unknown',
        'nsid': '',
        'flickr_url': '',
        'license': 'Unknown'
    }

def init_worker():
    """Make worker processes ignore SIGINT so the main process handles Ctrl+C."""
    signal.signal(signal.SIGINT, signal.SIG_IGN)

def process_url(url):
    """Wrap get_other_info so unexpected errors return an error record instead of crashing a worker."""
    try:
        return get_other_info(url)
    except Exception as e:
        return {
            'username': 'Error',
            'realname': str(e),
            'nsid': '',
            'flickr_url': url,
            'license': 'Unknown'
        }

def process_urls_in_chunks(urls, chunk_size=100000):
    """Look up author info for each URL in parallel, processing the list in chunks with a progress bar."""
    authors = []
    with Pool(cpu_count(), initializer=init_worker) as pool:
        try:
            # Process URLs in chunks
            for i in range(0, len(urls), chunk_size):
                chunk = urls[i:i + chunk_size]
                chunk_results = list(tqdm.tqdm(
                    pool.imap(process_url, chunk),
                    total=len(chunk),
                    desc=f"Processing chunk {i//chunk_size + 1}"
                ))
                authors.extend(chunk_results)
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            print("\nProcessing interrupted by user")
            sys.exit(1)
    return authors

if __name__ == "__main__":
    urls_file = "data/openimages_urls.txt"
    with open(urls_file) as f:
        urls = [line.strip() for line in f][:100000]

    authors = process_urls_in_chunks(urls)

    # Count unique authors by username
    unique_authors = len({author['username'] for author in authors})
    print(f"Number of unique authors: {unique_authors}")