kokluch commited on
Commit
d9f1916
·
1 Parent(s): 3a2b389

Add urlscan.io to classify malicious URLs as junk.

Browse files
Files changed (4) hide show
  1. app.py +17 -0
  2. requirements.txt +9 -6
  3. url_tools.py +29 -0
  4. urlscan_client.py +146 -0
app.py CHANGED
@@ -1,11 +1,16 @@
 
1
  from fastapi import FastAPI
2
  from fastapi.responses import JSONResponse, FileResponse
3
  from pydantic import BaseModel
4
  from enum import Enum
5
  from transformers import pipeline
6
  from phishing_datasets import submit_entry
 
 
 
7
 
8
  app = FastAPI()
 
9
 
10
  class MessageModel(BaseModel):
11
  text: str
@@ -64,6 +69,18 @@ def get_robot_txt():
64
  @app.post("/predict")
65
  def predict(model: InputModel) -> OutputModel:
66
  text = model.query.message.text
 
 
 
 
 
 
 
 
 
 
 
 
67
  label = pipe(text)
68
  if label[0]['label'] == 'LABEL_1':
69
  submit_entry(model.query.sender, model.query.message.text)
 
1
+ import httpx
2
  from fastapi import FastAPI
3
  from fastapi.responses import JSONResponse, FileResponse
4
  from pydantic import BaseModel
5
  from enum import Enum
6
  from transformers import pipeline
7
  from phishing_datasets import submit_entry
8
+ from url_tools import extract_urls, resolve_short_url
9
+ from urlscan_client import UrlscanClient
10
+ import requests
11
 
12
  app = FastAPI()
13
+ urlscan = UrlscanClient()
14
 
15
  class MessageModel(BaseModel):
16
  text: str
 
69
  @app.post("/predict")
70
  def predict(model: InputModel) -> OutputModel:
71
  text = model.query.message.text
72
+
73
+ urls = extract_urls(text)
74
+ results = [urlscan.scan(url) for url in urls]
75
+
76
+ for result in results:
77
+ overall = result.get('verdicts', {}).get('overall', {})
78
+ print(f"Checking verdict: {overall}")
79
+ if overall.get('hasVerdicts') and overall.get('score') > 0:
80
+ print("Match found. Submitting entry and returning JUNK.")
81
+ submit_entry(model.query.sender, model.query.message.text)
82
+ return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
83
+
84
  label = pipe(text)
85
  if label[0]['label'] == 'LABEL_1':
86
  submit_entry(model.query.sender, model.query.message.text)
requirements.txt CHANGED
@@ -1,7 +1,10 @@
1
- fastapi
2
- uvicorn[standard]
3
- pydantic
4
- transformers
5
  torch
6
- datasets
7
- pandas
 
 
 
 
1
+ fastapi~=0.115.12
2
+ uvicorn[standard]~=0.34.2
3
+ pydantic~=2.11.4
4
+ transformers~=4.51.3
5
  torch
6
+ datasets~=3.6.0
7
+ pandas~=2.2.3
8
+ httpx~=0.28.1
9
+ numpy~=2.2.5
10
+ requests~=2.32.3
url_tools.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from urllib.parse import urlparse, urlunparse
3
+ import httpx
4
+
5
def extract_urls(text: str):
    """Find all URL-like substrings in *text* and return them as a list.

    Recognizes an optional ``http(s)://`` or ``www.`` prefix, a dotted
    hostname, and an optional path component.
    """
    pattern = r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
    return re.findall(pattern, text)
9
+
10
+
11
def normalize_url(url: str) -> str:
    """Return *url* in normalized form, guaranteeing it carries a scheme."""
    parts = urlparse(url, scheme="http")
    if not parts.netloc:
        # Bare "host/path" strings parse with an empty netloc; re-parse with
        # an explicit scheme so the host lands in the netloc component.
        parts = urlparse("http://" + url)
    return urlunparse(parts)
17
+
18
def resolve_short_url(url: str) -> str:
    """Resolve one level of redirection for *url*.

    Sends a HEAD request without following redirects. If the server answers
    with a redirect status, the target from the ``Location`` header is
    returned, resolved against the request URL (servers may legally send a
    relative target). Otherwise — or on any network error — the normalized
    input URL is returned.
    """
    from urllib.parse import urljoin  # local import: keeps module imports unchanged

    url = normalize_url(url)
    try:
        with httpx.Client(follow_redirects=False, timeout=5) as client:
            response = client.head(url, headers={"User-Agent": "Mozilla/5.0"})
        if response.status_code in {301, 302, 303, 307, 308}:
            location = response.headers.get("location")
            # Fix: the original returned the raw header value, which could be
            # None (header missing) or a relative path; resolve it and fall
            # back to the request URL when absent.
            return urljoin(url, location) if location else url
        return url  # no redirect
    except httpx.RequestError as e:
        print(f"Error: {e}")
        return url
urlscan_client.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ UrlscanClient: A simple Python client for interacting with the urlscan.io API.
3
+
4
+ This client allows you to:
5
+ - Submit a URL to be scanned.
6
+ - Retrieve scan results by UUID.
7
+ - Search existing scans with a query.
8
+ - Perform a full scan workflow with error handling.
9
+
10
+ Environment Variable:
11
+ URLSCAN_API_KEY (str): If not passed during initialization, the client will attempt to use this environment variable.
12
+
13
+ Dependencies:
14
+ - requests
15
+ - os
16
+ - time
17
+ """
18
+
19
+ import os
20
+ import time
21
+ import requests
22
+
23
class UrlscanClient:
    """
    A client to interact with the urlscan.io API for submitting URLs, retrieving scan results,
    and searching the scan database.
    """
    BASE_URL = "https://urlscan.io/api/v1"
    # Bound every HTTP call so a stalled connection cannot hang the caller
    # indefinitely (requests has no default timeout).
    REQUEST_TIMEOUT = 10

    def __init__(self, api_key=None):
        """
        Initialize the UrlscanClient.

        Parameters:
            api_key (str, optional): Your urlscan.io API key. If not provided, it is read from
                                     the URLSCAN_API_KEY environment variable.

        Raises:
            ValueError: If the API key is not provided or found in environment variables.
        """
        self.api_key = api_key or os.getenv("URLSCAN_API_KEY")
        if not self.api_key:
            raise ValueError("API key is required. Set it via parameter or the URLSCAN_API_KEY environment variable.")
        self.headers = {
            "API-Key": self.api_key,
            "Content-Type": "application/json"
        }

    def submit_url(self, url, visibility="public", tags=None, country="fr", **options):
        """
        Submit a URL for scanning.

        Parameters:
            url (str): The URL to scan.
            visibility (str): Scan visibility ('public', 'unlisted', or 'private'). Defaults to 'public'.
            tags (list, optional): Optional list of tags to associate with the scan.
            country (str): Country the scan should originate from. Defaults to 'fr'
                           (previously hard-coded; now overridable, same default).
            **options: Additional scan options like 'useragent', 'referer', etc.

        Returns:
            dict: JSON response from the submission API.

        Raises:
            requests.HTTPError: If the request fails.
        """
        payload = {
            "url": url,
            "visibility": visibility,
            "country": country,
            "tags": tags or []
        }
        payload.update(options)
        response = requests.post(
            f"{self.BASE_URL}/scan/",
            headers=self.headers,
            json=payload,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def get_result(self, uuid, wait=True, timeout=60):
        """
        Retrieve the result of a scan by UUID.

        Parameters:
            uuid (str): The UUID of the scan result.
            wait (bool): Whether to wait for the scan to complete if it's not yet ready. Defaults to True.
            timeout (int): Maximum time (in seconds) to wait for the result if wait is True. Defaults to 60.

        Returns:
            dict: The scan result data.

        Raises:
            TimeoutError: If the result is not ready within the timeout period
                          (or immediately when wait is False and it is not ready).
            requests.HTTPError: If another HTTP error occurs.
        """
        result_url = f"{self.BASE_URL}/result/{uuid}/"
        start_time = time.time()
        while True:
            response = requests.get(result_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            if response.status_code == 200:
                return response.json()
            elif response.status_code == 404:
                # urlscan.io answers 404 while the scan is still in progress.
                if not wait or (time.time() - start_time) > timeout:
                    raise TimeoutError("Scan result not available yet.")
                time.sleep(5)
            else:
                response.raise_for_status()

    def search(self, query, size=10):
        """
        Search for past scans using a query string.

        Parameters:
            query (str): The search query, such as a domain name or IP address.
            size (int): Maximum number of results to return. Defaults to 10.

        Returns:
            dict: Search results from urlscan.io.

        Raises:
            requests.HTTPError: If the request fails.
        """
        params = {"q": query, "size": size}
        response = requests.get(
            f"{self.BASE_URL}/search/",
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def scan(self, url: str):
        """
        Convenience method to submit a scan and retrieve the result.

        Parameters:
            url (str): The URL to scan.

        Returns:
            dict: The scan result, or a benign fallback result (hasVerdicts=False,
                  score=0) if the scan fails for any expected reason.
        """
        try:
            print(f"Submit url {url}")
            submission = self.submit_url(url=url, visibility="public")
            print(f"Submitted scan. UUID: {submission['uuid']}")
            result = self.get_result(submission["uuid"])
            print(f"Submission succeed. UUID: {submission['uuid']}")
            return result
        # Fix: get_result raises TimeoutError and a malformed submission raises
        # KeyError; the original caught only RequestException, so a slow scan
        # crashed the caller instead of returning the documented fallback.
        except (requests.exceptions.RequestException, TimeoutError, KeyError) as e:
            print(f"Submission failed {e}")
            return {
                'page': {'url': url},
                'verdicts': {'overall': {'hasVerdicts': False, 'score': 0}}
            }