{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Visiting link:  https://www.ros.org/\n",
      "Visiting link:  https://docs.nav2.org/\n",
      "Visiting link:  https://moveit.ai/\n",
      "Visiting link:  https://gazebosim.org/home\n",
      "Visiting link:  https://github.com/ros2/ros2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Cloning into 'ros2'...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Visiting link:  https://github.com/ros-navigation/navigation2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Cloning into 'navigation2'...\n",
      "fatal: early EOF\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[10], line 35\u001b[0m\n\u001b[1;32m     33\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m     34\u001b[0m     os\u001b[38;5;241m.\u001b[39mchdir(local_temp)\n\u001b[0;32m---> 35\u001b[0m     \u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgit\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mclone\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlink\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     36\u001b[0m     repo_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(local_temp, os\u001b[38;5;241m.\u001b[39mlistdir(local_temp)[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m     37\u001b[0m     tree \u001b[38;5;241m=\u001b[39m {}\n",
      "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:550\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m    548\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m Popen(\u001b[38;5;241m*\u001b[39mpopenargs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;28;01mas\u001b[39;00m process:\n\u001b[1;32m    549\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 550\u001b[0m         stdout, stderr \u001b[38;5;241m=\u001b[39m \u001b[43mprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcommunicate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    551\u001b[0m     \u001b[38;5;28;01mexcept\u001b[39;00m TimeoutExpired \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m    552\u001b[0m         process\u001b[38;5;241m.\u001b[39mkill()\n",
      "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:1201\u001b[0m, in \u001b[0;36mPopen.communicate\u001b[0;34m(self, input, timeout)\u001b[0m\n\u001b[1;32m   1199\u001b[0m         stderr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m   1200\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m-> 1201\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1202\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   1203\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:1264\u001b[0m, in \u001b[0;36mPopen.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m   1262\u001b[0m     endtime \u001b[38;5;241m=\u001b[39m _time() \u001b[38;5;241m+\u001b[39m timeout\n\u001b[1;32m   1263\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1264\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1265\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m   1266\u001b[0m     \u001b[38;5;66;03m# https://bugs.python.org/issue25942\u001b[39;00m\n\u001b[1;32m   1267\u001b[0m     \u001b[38;5;66;03m# The first keyboard interrupt waits briefly for the child to\u001b[39;00m\n\u001b[1;32m   1268\u001b[0m     \u001b[38;5;66;03m# exit under the common assumption that it also received the ^C\u001b[39;00m\n\u001b[1;32m   1269\u001b[0m     \u001b[38;5;66;03m# generated SIGINT and will exit rapidly.\u001b[39;00m\n\u001b[1;32m   1270\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:2053\u001b[0m, in \u001b[0;36mPopen._wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m   2051\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   2052\u001b[0m     \u001b[38;5;28;01mbreak\u001b[39;00m  \u001b[38;5;66;03m# Another thread waited.\u001b[39;00m\n\u001b[0;32m-> 2053\u001b[0m (pid, sts) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_try_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2054\u001b[0m \u001b[38;5;66;03m# Check the pid and loop as waitpid has been known to\u001b[39;00m\n\u001b[1;32m   2055\u001b[0m \u001b[38;5;66;03m# return 0 even without WNOHANG in odd situations.\u001b[39;00m\n\u001b[1;32m   2056\u001b[0m \u001b[38;5;66;03m# http://bugs.python.org/issue14396.\u001b[39;00m\n\u001b[1;32m   2057\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m pid \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpid:\n",
      "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:2011\u001b[0m, in \u001b[0;36mPopen._try_wait\u001b[0;34m(self, wait_flags)\u001b[0m\n\u001b[1;32m   2009\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"All callers to this function MUST hold self._waitpid_lock.\"\"\"\u001b[39;00m\n\u001b[1;32m   2010\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 2011\u001b[0m     (pid, sts) \u001b[38;5;241m=\u001b[39m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwaitpid\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwait_flags\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2012\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mChildProcessError\u001b[39;00m:\n\u001b[1;32m   2013\u001b[0m     \u001b[38;5;66;03m# This happens if SIGCLD is set to be ignored or waiting\u001b[39;00m\n\u001b[1;32m   2014\u001b[0m     \u001b[38;5;66;03m# for child processes has otherwise been disabled for our\u001b[39;00m\n\u001b[1;32m   2015\u001b[0m     \u001b[38;5;66;03m# process.  This child is dead, we can't get the status.\u001b[39;00m\n\u001b[1;32m   2016\u001b[0m     pid \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpid\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# See README for more info on how the DataCollectionPipeline works\n",
    "# The ETL pipeline is part of the DataCollectionPipeline\n",
    "# Remove the time.sleep(1) line if you are sure you won't get blocked from a webpage for requesting too often\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import time\n",
    "import os\n",
    "import shutil\n",
    "import subprocess\n",
    "import tempfile\n",
    "import urllib.parse\n",
    "from shared import getMongoClient\n",
    "\n",
    "# Input into the Data Collection Pipeline is a list of links to domains\n",
    "links = ['https://www.ros.org/','https://docs.nav2.org/','https://moveit.ai/','https://gazebosim.org/home', 'https://github.com/ros2/ros2', 'https://github.com/ros-navigation/navigation2', 'https://github.com/moveit/moveit2', 'https://github.com/gazebosim/gazebo-classic']\n",
    "\n",
    "# Create a mongoDB connection\n",
    "mongoHost =  getMongoClient()\n",
    "mongoDatabase =  mongoHost[\"twin\"]\n",
    "# ETL pipeline\n",
    "# Extract data from links and their subdirectories(using crawlers)\n",
    "codes = []\n",
    "for link in links:\n",
    "    # Web scraper/crawler for github links\n",
    "    if \"https://github.com\" in link:\n",
    "        # Do not revisit a link already in the database\n",
    "        mongoCollection = mongoDatabase[\"Github\"]\n",
    "        result = mongoCollection.find_one({\"link\": {\"$regex\" : link}})\n",
    "        if result is None:\n",
    "            print(\"Visiting link: \", link)\n",
    "            # Modified GithubCrawler from LLM-Engineer for scraping github\n",
    "            local_temp = tempfile.mkdtemp()\n",
    "            try:\n",
    "                os.chdir(local_temp)\n",
    "                subprocess.run([\"git\", \"clone\", link])\n",
    "                repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])\n",
    "                tree = {}\n",
    "                for root, _, files in os.walk(repo_path):\n",
    "                    dir = root.replace(repo_path, \"\").lstrip(\"/\")\n",
    "                    if dir.startswith((\".git\", \".toml\", \".lock\", \".png\")):\n",
    "                        continue\n",
    "                    for file in files:\n",
    "                        if file.endswith((\".git\", \".toml\", \".lock\", \".png\")):\n",
    "                            continue\n",
    "                        file_path = os.path.join(dir, file)\n",
    "                        with open(\n",
    "                            os.path.join(root, file), \"r\", errors=\"ignore\"\n",
    "                        ) as f:\n",
    "                            tree[file_path] = f.read().replace(\" \", \"\")\n",
    "            except Exception:\n",
    "                print(f\"Error scrapping {link}\")\n",
    "            finally:\n",
    "                shutil.rmtree(local_temp)\n",
    "                # Correct the link\n",
    "                r = requests.get(link)\n",
    "                soup = BeautifulSoup(r.content, \"html.parser\")\n",
    "                # Find the file path to any of the files in the repository\n",
    "                link_element = soup.find(\"a\", attrs={\"class\": \"Link--primary\"})\n",
    "                path = link_element.get(\"href\")\n",
    "                path = path.rsplit(\"/\", 1)[0]\n",
    "                # Push all the subdirectories to mongo\n",
    "                for subdirectory in tree:\n",
    "                    #print(\n",
    "                    #    f\"Adding subdirectory: https://github.com{path}/{subdirectory}\"\n",
    "                    #)\n",
    "                    text = tree[subdirectory]\n",
    "                    # Transform the data\n",
    "                    # Get rid of repeating \\n characters and spaces\n",
    "                    text = text.replace(\"\\t\", \" \")\n",
    "                    text = text.replace(\"\\n\", \" \")\n",
    "                    text_len = len(text)\n",
    "                    for i in range(text_len):\n",
    "                        while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
    "                            text = text[:i] + text[i + 1 :]\n",
    "                            text_len -= 1\n",
    "                    codes.append(\n",
    "                        {\n",
    "                            \"link\": \"https://github.com\"\n",
    "                            + path\n",
    "                            + \"/\"\n",
    "                            + subdirectory,\n",
    "                            \"type\": \"Github\",\n",
    "                            \"content\": text,\n",
    "                        }\n",
    "                    )\n",
    "                mongoCollection.insert_many(codes)\n",
    "                codes = []\n",
    "        else:\n",
    "            print(\"Already visited: \", link)\n",
    "    # Web scraper/crawler for other links(Documents)\n",
    "    else:\n",
    "        # Do not revisit a link already in the database\n",
    "        mongoCollection = mongoDatabase[\"Document\"]\n",
    "        result = mongoCollection.find_one({\"link\": link})\n",
    "        if result is None:\n",
    "            print(\"Visiting link: \", link)\n",
    "            try:\n",
    "                # Get all text in the website\n",
    "                r = requests.get(link)\n",
    "                soup = BeautifulSoup(r.content, \"html.parser\")\n",
    "                soup.find_all([\"p\", \"h1\", \"h2\", \"h3\", \"h4\", \"h5\", \"h6\"])\n",
    "                text = soup.get_text()\n",
    "                # Transform the data\n",
    "                # Get rid of repeating \\n characters and spaces\n",
    "                text = text.replace(\"\\t\", \" \")\n",
    "                text = text.replace(\"\\n\", \" \")\n",
    "                text_len = len(text)\n",
    "                for i in range(text_len):\n",
    "                    while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
    "                        text = text[:i] + text[i + 1 :]\n",
    "                        text_len -= 1\n",
    "                if \"404\" not in text:\n",
    "                    mongoCollection.insert_one({\"link\": link, \"type\": \"Document\", \"content\": text})\n",
    "                else:\n",
    "                    print(\"Page not found: \", link)\n",
    "                # Also crawl through all subdirectorys in the link(related links)\n",
    "                soup = BeautifulSoup(r.content, \"html.parser\")\n",
    "                subdirectories = [a.get(\"href\") for a in soup.find_all(\"a\")]\n",
    "                for subdirectory in subdirectories:\n",
    "                    newLink = urllib.parse.urljoin(link, subdirectory)\n",
    "                    if (\n",
    "                        subdirectory is not None and\n",
    "                        'http' not in subdirectory and\n",
    "                        mongoCollection.find_one({\"link\": newLink}) is None\n",
    "                    ):\n",
    "                        #print(\"Adding subdirectory: \", link + subdirectory)\n",
    "                        links.append(newLink)\n",
    "            except:\n",
    "                print(\"Could not crawl link\", link)\n",
    "        else:\n",
    "            print(\"Already visited: \", link)\n",
    "    # Avoid spamming sites\n",
    "    time.sleep(0.1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}