Add urlscan.io to classify malicious URLs as junk.
- app.py +17 -0
- requirements.txt +9 -6
- url_tools.py +29 -0
- urlscan_client.py +146 -0
app.py
CHANGED
@@ -1,11 +1,16 @@
+import httpx
 from fastapi import FastAPI
 from fastapi.responses import JSONResponse, FileResponse
 from pydantic import BaseModel
 from enum import Enum
 from transformers import pipeline
 from phishing_datasets import submit_entry
+from url_tools import extract_urls, resolve_short_url
+from urlscan_client import UrlscanClient
+import requests
 
 app = FastAPI()
+urlscan = UrlscanClient()
 
 class MessageModel(BaseModel):
     text: str
@@ -64,6 +69,18 @@ def get_robot_txt():
 @app.post("/predict")
 def predict(model: InputModel) -> OutputModel:
     text = model.query.message.text
+
+    urls = extract_urls(text)
+    results = [urlscan.scan(url) for url in urls]
+
+    for result in results:
+        overall = result.get('verdicts', {}).get('overall', {})
+        print(f"Checking verdict: {overall}")
+        if overall.get('hasVerdicts') and overall.get('score') > 0:
+            print("Match found. Submitting entry and returning JUNK.")
+            submit_entry(model.query.sender, model.query.message.text)
+            return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
+
     label = pipe(text)
     if label[0]['label'] == 'LABEL_1':
         submit_entry(model.query.sender, model.query.message.text)
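For reference, a hypothetical smoke test of the updated endpoint. The full InputModel schema is not shown in this diff; the payload shape below is inferred from the attributes predict() reads (model.query.sender and model.query.message.text), and the host, port, sender, and message text are all made up:

# smoke_test.py - hypothetical manual check against a locally running instance
import httpx

payload = {
    "query": {
        "sender": "+15555550123",  # made-up sender
        "message": {"text": "Win a prize: bit.ly/3abcDEF"},  # made-up message containing a shortened URL
    }
}
resp = httpx.post("http://localhost:8000/predict", json=payload, timeout=120)
print(resp.json())  # expect a JUNK action when urlscan.io returns a positive verdict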
requirements.txt
CHANGED
@@ -1,7 +1,10 @@
-fastapi
-uvicorn[standard]
-pydantic
-transformers
+fastapi~=0.115.12
+uvicorn[standard]~=0.34.2
+pydantic~=2.11.4
+transformers~=4.51.3
 torch
-datasets
-pandas
+datasets~=3.6.0
+pandas~=2.2.3
+httpx~=0.28.1
+numpy~=2.2.5
+requests~=2.32.3
url_tools.py
ADDED
@@ -0,0 +1,29 @@
+import re
+from urllib.parse import urlparse, urlunparse
+import httpx
+
+def extract_urls(text: str):
+    """Extract URLs from raw text."""
+    url_pattern = r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
+    return re.findall(url_pattern, text)
+
+
+def normalize_url(url: str) -> str:
+    """Ensure the URL has a scheme and is normalized."""
+    parsed = urlparse(url, scheme="http")
+    if not parsed.netloc:
+        parsed = urlparse("http://" + url)
+    return urlunparse(parsed)
+
+def resolve_short_url(url: str) -> str:
+    """Make a HEAD request without following redirects, return the Location if redirected."""
+    url = normalize_url(url)
+    try:
+        with httpx.Client(follow_redirects=False, timeout=5) as client:
+            response = client.head(url, headers={"User-Agent": "Mozilla/5.0"})
+            if response.status_code in {301, 302, 303, 307, 308}:
+                return response.headers.get("location")
+            return url  # No redirect
+    except httpx.RequestError as e:
+        print(f"Error: {e}")
+        return url
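A quick sketch of how these helpers compose. The shortened URL is a made-up example, and resolve_short_url's return value depends on the live shortener:

from url_tools import extract_urls, normalize_url, resolve_short_url

text = "Your parcel is waiting: bit.ly/3abcDEF or https://pay-fees.example.com/now"
print(extract_urls(text))
# -> ['bit.ly/3abcDEF', 'https://pay-fees.example.com/now']

print(normalize_url("bit.ly/3abcDEF"))
# -> 'http://bit.ly/3abcDEF'

# resolve_short_url() sends a HEAD request without following redirects and
# returns the Location header on a 3xx response, so its output here depends
# on the live service.
print(resolve_short_url("bit.ly/3abcDEF"))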
urlscan_client.py
ADDED
@@ -0,0 +1,146 @@
+"""
+UrlscanClient: A simple Python client for interacting with the urlscan.io API.
+
+This client allows you to:
+- Submit a URL to be scanned.
+- Retrieve scan results by UUID.
+- Search existing scans with a query.
+- Perform a full scan workflow with error handling.
+
+Environment Variable:
+    URLSCAN_API_KEY (str): If not passed during initialization, the client will attempt to use this environment variable.
+
+Dependencies:
+- requests
+- os
+- time
+"""
+
+import os
+import time
+import requests
+
+class UrlscanClient:
+    """
+    A client to interact with the urlscan.io API for submitting URLs, retrieving scan results,
+    and searching the scan database.
+    """
+    BASE_URL = "https://urlscan.io/api/v1"
+
+    def __init__(self, api_key=None):
+        """
+        Initialize the UrlscanClient.
+
+        Parameters:
+            api_key (str, optional): Your urlscan.io API key. If not provided, it is read from
+                the URLSCAN_API_KEY environment variable.
+
+        Raises:
+            ValueError: If the API key is not provided or found in environment variables.
+        """
+        self.api_key = api_key or os.getenv("URLSCAN_API_KEY")
+        if not self.api_key:
+            raise ValueError("API key is required. Set it via parameter or the URLSCAN_API_KEY environment variable.")
+        self.headers = {
+            "API-Key": self.api_key,
+            "Content-Type": "application/json"
+        }
+
+    def submit_url(self, url, visibility="public", tags=None, **options):
+        """
+        Submit a URL for scanning.
+
+        Parameters:
+            url (str): The URL to scan.
+            visibility (str): Scan visibility ('public', 'unlisted', or 'private'). Defaults to 'public'.
+            tags (list, optional): Optional list of tags to associate with the scan.
+            **options: Additional scan options like 'useragent', 'referer', 'country', etc.
+
+        Returns:
+            dict: JSON response from the submission API.
+
+        Raises:
+            requests.HTTPError: If the request fails.
+        """
+        payload = {
+            "url": url,
+            "visibility": visibility,
+            "country": "fr",
+            "tags": tags or []
+        }
+        payload.update(options)
+        response = requests.post(f"{self.BASE_URL}/scan/", headers=self.headers, json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def get_result(self, uuid, wait=True, timeout=60):
+        """
+        Retrieve the result of a scan by UUID.
+
+        Parameters:
+            uuid (str): The UUID of the scan result.
+            wait (bool): Whether to wait for the scan to complete if it's not yet ready. Defaults to True.
+            timeout (int): Maximum time (in seconds) to wait for the result if wait is True. Defaults to 60.
+
+        Returns:
+            dict: The scan result data.
+
+        Raises:
+            TimeoutError: If the result is not ready within the timeout period.
+            requests.HTTPError: If another HTTP error occurs.
+        """
+        result_url = f"{self.BASE_URL}/result/{uuid}/"
+        start_time = time.time()
+        while True:
+            response = requests.get(result_url, headers=self.headers)
+            if response.status_code == 200:
+                return response.json()
+            elif response.status_code == 404:
+                if not wait or (time.time() - start_time) > timeout:
+                    raise TimeoutError("Scan result not available yet.")
+                time.sleep(5)
+            else:
+                response.raise_for_status()
+
+    def search(self, query, size=10):
+        """
+        Search for past scans using a query string.
+
+        Parameters:
+            query (str): The search query, such as a domain name or IP address.
+            size (int): Maximum number of results to return. Defaults to 10.
+
+        Returns:
+            dict: Search results from urlscan.io.
+
+        Raises:
+            requests.HTTPError: If the request fails.
+        """
+        params = {"q": query, "size": size}
+        response = requests.get(f"{self.BASE_URL}/search/", headers=self.headers, params=params)
+        response.raise_for_status()
+        return response.json()
+
+    def scan(self, url: str):
+        """
+        Convenience method to submit a scan and retrieve the result.
+
+        Parameters:
+            url (str): The URL to scan.
+
+        Returns:
+            dict: The scan result, or a fallback result if the scan fails.
+        """
+        try:
+            print(f"Submit url {url}")
+            submission = self.submit_url(url=url, visibility="public")
+            print(f"Submitted scan. UUID: {submission['uuid']}")
+            result = self.get_result(submission["uuid"])
+            print(f"Submission succeed. UUID: {submission['uuid']}")
+            return result
+        except requests.exceptions.RequestException as e:
+            print(f"Submission failed {e}")
+            return {
+                'page': {'url': url},
+                'verdicts': {'overall': {'hasVerdicts': False, 'score': 0}}
+            }
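And a minimal usage sketch for the client on its own, assuming URLSCAN_API_KEY is exported in the environment and the scan target is a placeholder; the verdict fields mirror the fallback shape above:

from urlscan_client import UrlscanClient

client = UrlscanClient()  # reads URLSCAN_API_KEY from the environment
result = client.scan("https://example.com")  # submits a public scan, then polls /result/<uuid>/
overall = result.get("verdicts", {}).get("overall", {})
if overall.get("hasVerdicts") and overall.get("score", 0) > 0:
    print(f"urlscan.io flagged {result['page']['url']}")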