Spaces:
Sleeping
Sleeping
File size: 3,491 Bytes
77f290b 08a78c1 77f290b 3cfadc8 8ac76ef 77f290b 8ac76ef 77f290b 8ac76ef b048432 8ac76ef 77f290b 3cfadc8 77f290b 3cfadc8 77f290b 3cfadc8 77f290b 8ac76ef 7a8d600 8ac76ef 77f290b 8ac76ef 08a78c1 8ac76ef 77f290b 6e3c928 3cfadc8 77f290b 6e3c928 77f290b 3cfadc8 77f290b 69cbe77 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import pandas as pd
import os
from evaluations import documentation, requirements, training, validating, license, weights, pitfalls
from evaluations.utils import *
import zipfile
import os
import numpy as np
from huggingface_hub import InferenceClient
def evaluate(llm, verbose, repo_url, title=None, year=None, zip=None):
    """Evaluate a code repository and return a dict of predictions.

    The result always contains the ``pred_*`` keys initialised below. A
    run that gets as far as the repository archive additionally sets
    ``"NA"`` and ``"pred_codetocomment"``.

    Args:
        llm: Passed through to the individual evaluators. A falsy value
            only triggers a log message saying no LLM will be used.
        verbose: Forwarded to ``log`` and to every helper/evaluator.
        repo_url: URL of the repository to evaluate.
        title: Optional paper title; with ``year`` it is used to look up
            a citation count through ``fetch_openalex``.
        year: Optional publication year (see ``title``).
        zip: Optional already-open ``zipfile.ZipFile`` of the repository.
            When ``None`` the archive is fetched to ``data/repo.zip``
            using the token in the ``githubToken`` environment variable.
            (The name shadows the builtin but is kept because callers
            pass it by keyword.)

    Returns:
        dict: The results collected so far. ``pred_live`` is ``"No"`` if
        the download is missing on disk or any exception was raised, and
        ``pred_valid`` stays ``False`` when ``get_api_link`` returns ``""``.
    """
    # Bound before the ``try`` so the ``except`` handler can always reach it,
    # even when the very first statement inside the ``try`` raises.
    results = {
        "pred_live": "Yes",
        "pred_dependencies": None,
        "pred_training": None,
        "pred_evaluation": None,
        "pred_weights": None,
        "pred_readme": None,
        "pred_license": None,
        "pred_stars": None,
        "pred_citations": None,
        "pred_valid": False,
    }
    # Set only when this function opens the archive itself; an archive
    # supplied by the caller remains the caller's responsibility.
    owned_archive = None

    try:
        if not llm:
            log(verbose, "LOG", "No LLM will be used for the evaluation.")

        # Citation lookup needs both a non-empty title and a non-empty year.
        if title is not None and year is not None and title != "" and year != "":
            res = fetch_openalex(verbose, title, year)
            if res is not None:
                res = res["results"]
                if len(res) > 0:
                    # Only the first returned entry is used.
                    results["pred_citations"] = res[0]["cited_by_count"]

        # An empty API link ends the evaluation early with pred_valid False.
        if get_api_link(repo_url) == "":
            return results
        results["pred_valid"] = True

        username, repo_name = decompose_url(repo_url)

        # If you don't provide a zip file, it will be fetched from github.
        # For this, you need to provide a github token.
        if zip is None:
            token = os.getenv("githubToken")
            repository_zip_name = "data/repo.zip"
            log(verbose, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")

            fetch_repo(verbose, repo_url, repository_zip_name, token)

            # No file on disk after fetching: report the repository as not live.
            if not os.path.exists(repository_zip_name):
                results["pred_live"] = "No"
                return results

            results["pred_stars"] = fetch_repo_stars(verbose, repo_url, token)

            owned_archive = zipfile.ZipFile(repository_zip_name)
            zip = owned_archive

        readme = fetch_readme(zip)
        results["NA"] = documentation.is_applicable(verbose, llm, readme)

        results["pred_license"] = license.evaluate(verbose, llm, zip, readme)

        # Only logged; the remaining evaluators still run on the archive.
        if len(zip.namelist()) <= 2:
            log(verbose, "LOG", "The repository is empty.")

        results["pred_dependencies"] = requirements.evaluate(verbose, llm, zip, readme)
        results["pred_training"] = training.evaluate(verbose, llm, zip, readme)
        results["pred_evaluation"] = validating.evaluate(verbose, llm, zip, readme)
        results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
        results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
        results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
        # Return value is not stored in the results.
        pitfalls.evaluate(verbose, llm, zip, readme)

        return results
    except Exception as e:
        # Best-effort boundary: any failure is reported as "not live" and the
        # partial results gathered so far are still returned.
        log(verbose, "ERROR", "Evaluating repository failed: " + str(e))
        results["pred_live"] = "No"
        return results
    finally:
        # Release the file handle of an archive opened by this call.
        if owned_archive is not None:
            owned_archive.close()
def full_evaluation():
    """Run ``evaluate`` for every row of ``data/zipfiles.csv`` that has a URL.

    The tab-separated file is read with pandas. The columns ``url``,
    ``title``, ``year`` and ``zip_idx`` are used, where ``zip_idx`` is
    passed to ``zipfile.ZipFile`` to open that row's repository archive.
    Rows whose ``url`` is missing or empty are skipped. Evaluation runs
    without an LLM and with verbose logging off.

    Returns:
        pandas.DataFrame: One row per evaluated input row, holding the
        original columns plus every key returned by ``evaluate``.
    """
    paper_dump = pd.read_csv("data/zipfiles.csv", sep="\t")
    # Loop-invariant denominator for the progress message (non-null titles).
    title_count = paper_dump["title"].count()
    full_results = []

    for idx, row in paper_dump.iterrows():
        url = row["url"]
        if pd.isna(url) or url == "":
            continue

        print(str(int(100 * idx / title_count)) + "% done")

        # The context manager closes the archive once the row is evaluated.
        with zipfile.ZipFile(row["zip_idx"]) as archive:
            result = evaluate(None, False, url, row["title"], row["year"], zip=archive)

        # Merge the predictions into this row's Series.
        for column, value in result.items():
            row[column] = value

        full_results.append(row)

    return pd.DataFrame(full_results)
|