File size: 3,491 Bytes
77f290b
 
08a78c1
77f290b
 
 
 
 
 
3cfadc8
8ac76ef
 
 
77f290b
8ac76ef
 
 
 
 
 
 
 
 
77f290b
8ac76ef
 
 
 
b048432
8ac76ef
77f290b
3cfadc8
 
 
 
 
77f290b
3cfadc8
77f290b
3cfadc8
 
 
 
 
 
 
77f290b
8ac76ef
 
7a8d600
8ac76ef
77f290b
8ac76ef
 
08a78c1
8ac76ef
 
 
 
 
 
 
 
 
 
 
 
 
77f290b
6e3c928
3cfadc8
77f290b
 
 
 
6e3c928
77f290b
 
 
3cfadc8
77f290b
 
 
 
69cbe77
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import pandas as pd
import os
from evaluations import documentation, requirements, training, validating, license, weights, pitfalls
from evaluations.utils import *
import zipfile
import os
import numpy as np
from huggingface_hub import InferenceClient

def evaluate(llm, verbose, repo_url, title=None, year=None, zip=None):
    """Evaluate a single repository and return a dict of predicted metrics.

    Args:
        llm: LLM client used by the sub-evaluators; falsy means "no LLM".
        verbose: verbosity flag forwarded to log() and the sub-evaluators.
        repo_url: URL of the GitHub repository to evaluate.
        title: optional paper title, used to look up citations on OpenAlex.
        year: optional publication year for the OpenAlex lookup.
        zip: optional pre-opened zipfile.ZipFile of the repository; if None,
            the repository is fetched from GitHub (requires the `githubToken`
            environment variable).

    Returns:
        Dict of "pred_*" fields. On any failure, `pred_live` is set to "No"
        and whatever was collected so far is returned.
    """
    # Initialize before the try-block so the except handler can always
    # reference `results` (previously a NameError if an early line raised).
    results = {
        "pred_live": "Yes",
        "pred_dependencies": None,
        "pred_training": None,
        "pred_evaluation": None,
        "pred_weights": None,
        "pred_readme": None,
        "pred_license": None,
        "pred_stars": None,
        "pred_citations": None,
        "pred_valid": False,
    }
    try:
        if not llm:
            log(verbose, "LOG", "No LLM will be used for the evaluation.")

        # Citation count lookup requires both a non-empty title and year.
        if title is not None and year is not None and title != "" and year != "":
            res = fetch_openalex(verbose, title, year)
            if res is not None:
                res = res["results"]
                if len(res) > 0:
                    res = res[0]
                    results["pred_citations"] = res["cited_by_count"]

        if get_api_link(repo_url) != "":
            results["pred_valid"] = True
        else:
            return results

        username, repo_name = decompose_url(repo_url)

        # If you don't provide a zip file, it will be fetched from github.
        # For this, you need to provide a github token.
        if zip is None:
            token = os.getenv("githubToken")
            repository_zip_name = "data/repo.zip"
            log(verbose, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")

            fetch_repo(verbose, repo_url, repository_zip_name, token)

            # A missing archive means the repository could not be fetched.
            if not os.path.exists(repository_zip_name):
                results["pred_live"] = "No"
                return results

            results["pred_stars"] = fetch_repo_stars(verbose, repo_url, token)

            zip = zipfile.ZipFile(repository_zip_name)

        readme = fetch_readme(zip)
        results["NA"] = documentation.is_applicable(verbose, llm, readme)

        results["pred_license"] = license.evaluate(verbose, llm, zip, readme)

        # A zip with <= 2 entries is effectively empty (just directory stubs).
        if len(zip.namelist()) <= 2:
            log(verbose, "LOG", "The repository is empty.")

        results["pred_dependencies"] = requirements.evaluate(verbose, llm, zip, readme)
        results["pred_training"] = training.evaluate(verbose, llm, zip, readme)
        results["pred_evaluation"] = validating.evaluate(verbose, llm, zip, readme)
        results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
        results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
        results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
        # pitfalls.evaluate only logs findings; its return value is unused.
        pitfalls.evaluate(verbose, llm, zip, readme)

        return results
    except Exception as e:
        # Best-effort boundary: report the failure and mark the repo dead
        # rather than crashing a batch evaluation run.
        log(verbose, "ERROR", "Evaluating repository failed: " + str(e))
        results["pred_live"] = "No"
        return results

def full_evaluation():
    """Evaluate every repository listed in data/zipfiles.csv.

    Reads the tab-separated dump, skips rows without a URL, runs evaluate()
    on each remaining row (using the row's pre-downloaded zip archive), and
    merges the prediction fields back into the row.

    Returns:
        pandas.DataFrame with one row per evaluated repository, containing
        the original columns plus the "pred_*" result columns.
    """
    paper_dump = pd.read_csv("data/zipfiles.csv", sep="\t")
    full_results = []
    # Use the total row count for progress; counting non-null titles
    # (the previous denominator) under-reports when titles are missing.
    total_rows = len(paper_dump)

    for idx, row in paper_dump.iterrows():
        # Skip entries without a repository URL. Short-circuiting `or`
        # avoids comparing NaN against "" when the URL is missing.
        if pd.isna(row["url"]) or row["url"] == "":
            continue

        print(f"{int(100 * idx / total_rows)}% done")
        result = evaluate(None, False, row["url"], row["title"], row["year"],
                          zip=zipfile.ZipFile(row["zip_idx"]))
        # Merge predicted fields into the row copy returned by iterrows().
        for column in result.keys():
            row[column] = result[column]

        full_results.append(row)
    return pd.DataFrame(full_results)