Skip to content

Commit a4a0f1a

Browse files
authored
Merge pull request #202 from DedSecInside/simplify_link_code
Simplify LinkNode and add new display
2 parents 8ca1995 + a8ae3cd commit a4a0f1a

7 files changed

Lines changed: 146 additions & 285 deletions

File tree

src/modules/analyzer.py

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,12 @@
33
"""
44
from requests.exceptions import HTTPError
55

6-
from ete3 import Tree, TreeStyle, TextFace, add_face_to_node
7-
from .link import LinkNode
8-
from .utils import multi_thread
6+
from ete3 import faces, Tree, TreeStyle, TextFace, add_face_to_node
97

108

119
def default_layout(node):
    """Layout function that labels a tree node with its name.

    Args:
        node: Tree node being laid out; its ``name`` becomes the label text.
    """
    label = TextFace(node.name, tight_text=True)
    faces.add_face_to_node(
        label,
        node,
        column=0,
        position='branch-bottom',
    )
1412

1513

1614
default_style = TreeStyle()
@@ -51,6 +49,7 @@ def save(self, file_name, tree_style=default_style):
5149
file_name (str): Name of file being saved to
5250
tree_style (TreeStyle): Styling of downloaded tree
5351
"""
52+
self._tree.layout_fn = default_layout
5453
self._tree.render(file_name, tree_style)
5554

5655
def show(self, tree_style=default_style):
@@ -60,39 +59,35 @@ def show(self, tree_style=default_style):
6059
Args:
6160
tree_style (TreeStyle): Styling of downloaded tree
6261
"""
63-
self._tree.show(tree_style)
62+
self._tree.layout_fn = default_layout
63+
self._tree.show(tree_style=tree_style)
6464

6565

66-
def build_tree(node, stop=1, rec=0):
    """
    Builds link tree by traversing through children nodes.

    Args:
        node (LinkNode): root node of tree
        stop (int): depth of tree
        rec (int): level of recursion

    Returns:
        tree (ete3.Tree): Built tree.
    """
    name = node.get_name()
    print('Adding node for: ', name)
    tree = Tree(name=name)

    # `>=` rather than `==`: with a negative `stop`, or a `rec` that already
    # exceeds `stop`, an equality test never fires and the recursion would
    # only end when the link graph itself runs out of children.
    if rec >= stop:
        return tree

    next_level = rec + 1
    for child in node.get_children():
        if child.get_children():
            tree.add_child(build_tree(child, stop, next_level))
        else:
            # Childless pages become plain leaves without another recursive call.
            tree.add_child(Tree(name=child.get_name()))

    return tree

src/modules/collect_data.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88

99
from bs4 import BeautifulSoup
1010
from dotenv import load_dotenv
11-
from .link import LinkNode
12-
from .utils import multi_thread
1311
from .utils import find_file
1412
from threadsafe.safe_csv import SafeDictWriter
1513
from progress.bar import Bar
1614

15+
from .validators import validate_link
16+
1717

1818
dev_file = find_file("torbot_dev.env", "../")
1919
if not dev_file:
@@ -31,7 +31,7 @@ def parse_links(html):
3131
"""
3232
soup = BeautifulSoup(html, 'html.parser')
3333
tags = soup.find_all('a')
34-
return [tag['href'] for tag in tags if LinkNode.valid_link(tag['href'])]
34+
return [tag['href'] for tag in tags if validate_link(tag['href'])]
3535

3636

3737
def parse_meta_tags(soup):

src/modules/info.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,7 @@
99
from re import search, findall
1010
from requests.exceptions import HTTPError
1111
import requests
12-
from requests import get
1312
import re
14-
from .link_io import LinkIO
1513

1614

1715
def execute_all(link, *, display_status=False):
@@ -40,9 +38,8 @@ def execute_all(link, *, display_status=False):
4038
bad_scripts = set() # unclean javascript file urls
4139
datasets = [files, intel, robots, custom, failed, scripts, external, fuzzable, endpoints, keys]
4240
dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'scripts', 'external', 'fuzzable', 'endpoints', 'keys']
43-
page, response = LinkIO.read(link, response=True, show_msg=display_status)
44-
response = get(link, verify=False).text
45-
soup = BeautifulSoup(page, 'html.parser')
41+
response = requests.get(link)
42+
soup = BeautifulSoup(response.text, 'html.parser')
4643
validation_functions = [get_robots_txt, get_dot_git, get_dot_svn, get_dot_git, get_intel, get_bitcoin_address]
4744
for validate_func in validation_functions:
4845
try:

src/modules/link.py

Lines changed: 67 additions & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,12 @@
22
This module is used to create a LinkNode that can be consumed by a LinkTree
33
and contains useful Link methods.
44
"""
5-
import requests
6-
import requests.exceptions
7-
import validators
85
import re
6+
import requests
97
from bs4 import BeautifulSoup
10-
from .utils import multi_thread
8+
119
from .color import color
12-
import sys
10+
from .validators import validate_email, validate_link
1311

1412
def get_emails(node):
1513
"""Finds all emails associated with node
@@ -21,29 +19,21 @@ def get_emails(node):
2119
emails (list): List of emails.
2220
"""
2321
emails = []
24-
response = node.response.text
25-
mails = re.findall(r'[\w\.-]+@[\w\.-]+', response)
22+
mails = re.findall(r'[\w\.-]+@[\w\.-]+', node._node.get_text())
2623
for email in mails:
27-
if LinkNode.valid_email(email):
24+
if validate_email(email):
2825
emails.append(email)
2926
return emails
3027

3128

32-
def get_links(node):
33-
"""Finds all links associated with node
34-
35-
Args:
36-
node (LinkNode): Node used to get links from.
37-
38-
Returns:
39-
links (list): List of links.
40-
"""
41-
links = []
42-
for child in node.children:
43-
link = child.get('href')
44-
if link and LinkNode.valid_link(link):
45-
links.append(link)
46-
return links
29+
def get_children(node):
    """Builds a child LinkNode for every valid link on the node's page.

    Args:
        node (LinkNode): Node whose parsed page is searched for anchor tags.

    Returns:
        children (list): List of LinkNode objects, one per valid href.
    """
    children = []
    for anchor_tag in node._node.find_all('a'):
        link = anchor_tag.get('href')
        # Anchors without an href yield None; skip those (and empty strings)
        # instead of handing a non-URL value to the validator.
        if link and validate_link(link):
            children.append(LinkNode(link))
    return children
4737

4838

4939
def get_json_data(node):
@@ -56,13 +46,9 @@ def get_json_data(node):
5646
titles (list): List of Titles.
5747
"""
5848
json = []
59-
for child in node.children:
60-
link = child.get('href')
61-
title = "Not Available"
62-
if link and LinkNode.valid_link(link):
63-
node = LinkNode(link)
64-
title = node.name
65-
json.append({"link":link,"title":title})
49+
for anchor_tag in node._node.find_all('a'):
50+
link = anchor_tag.get('href')
51+
json.append({"link":link,"tag":anchor_tag})
6652
return json
6753

6854

@@ -73,26 +59,14 @@ def get_images(node):
7359
node (LinkNode): Node used to get links from.
7460
7561
Returns:
76-
links (list): List of links.
62+
imageEls (list): A collection of img HTML elements
7763
"""
78-
links = []
79-
for child in node.children:
80-
link = child.get('src')
81-
if link and LinkNode.valid_link(link):
82-
links.append(link)
83-
return links
84-
85-
86-
def get_metadata(node):
87-
"""Collect response headers.
88-
89-
Args:
90-
node (LinkNode): Node used to get metadata from.
91-
92-
Returns:
93-
metadata (dict): Dictionary with metadata.
94-
"""
95-
return node.response.headers
64+
imageEls = []
65+
for anchor_tag in node._node.find_all('a'):
66+
image = anchor_tag.get('src')
67+
if validate_link(image):
68+
imageEls.append(image)
69+
return imageEls
9670

9771

9872
class LinkNode:
@@ -105,113 +79,59 @@ def __init__(self, link):
10579
link (str): URL used to initialise node.
10680
"""
10781
# If link has invalid form, throw an error
108-
if not self.valid_link(link):
82+
if not validate_link(link):
10983
raise ValueError("Invalid link format.")
11084

111-
self._children = []
112-
self._emails = []
113-
self._links = []
114-
self._images = []
115-
self._json_data = []
116-
self._metadata = {}
85+
self._loaded = False
86+
self._name = link
87+
self._link = link
11788

118-
# Attempts to connect to link, throws an error if link is unreachable
89+
def load_data(self):
    """Fetches the node's page and fills in the node's cached data.

    On success the response headers, the parsed document, and the
    children, emails, images and JSON data extracted from it are stored on
    the node. If the response carries an error status or extraction fails,
    the node keeps empty collections, a yellow status and the
    'TITLE NOT FOUND' name. A page that parses fine but has no <title>
    keeps its extracted data and only gets the fallback name and yellow
    status. Once a response has been received the node is marked as
    loaded, whatever the outcome.

    Raises:
        Whatever ``requests.get`` raises: the request is made outside the
        ``try`` block, so connection errors propagate to the caller and
        the node is left unloaded.
    """
    response = requests.get(self._link)
    status = str(response.status_code)

    # Start from empty values so every getter works even when loading
    # stops part-way; previously the failure path left these unset.
    self._node = None
    self._metadata = {}
    self._children = []
    self._emails = []
    self._images = []
    self._json_data = []

    try:
        response.raise_for_status()
        self._metadata = response.headers
        # The extraction helpers read the parsed page through self._node.
        self._node = BeautifulSoup(response.text, 'html.parser')
        children = get_children(self)
        emails = get_emails(self)
        images = get_images(self)
        json_data = get_json_data(self)
    except Exception:
        # Deliberately best-effort: any HTTP or parsing failure downgrades
        # the node to an unnamed, childless leaf instead of aborting.
        self._node = None
        self.status = color(status, 'yellow')
        self._name = 'TITLE NOT FOUND'
    else:
        self._children = children
        self._emails = emails
        self._images = images
        self._json_data = json_data
        title = self._node.title
        if title is None:
            # A missing <title> should not discard the links found above.
            self.status = color(status, 'yellow')
            self._name = 'TITLE NOT FOUND'
        else:
            self.status = color(status, 'green')
            self._name = title.string
    finally:
        self._loaded = True
154109

155-
@property
156-
def links(self):
157-
"""
158-
Getter for node links
159-
"""
160-
if not self._links:
161-
self._links = get_links(self)
162-
return self._links
163110

164-
@property
165-
def images(self):
166-
"""
167-
Getter for node images
168-
"""
169-
if not self._images:
170-
self._images = get_images(self)
171-
return self._images
111+
def get_link(self):
112+
return self._link
172113

173-
@property
174-
def children(self):
175-
"""
176-
Getter for node children
177-
"""
178-
if not self._children:
179-
self._children = self._node.find_all('a')
114+
def get_name(self):
115+
if not self._loaded:
116+
self.load_data()
117+
return self._name
118+
119+
def get_children(self):
120+
if not self._loaded:
121+
self.load_data()
180122
return self._children
181123

182-
@property
183-
def metadata(self):
184-
"""
185-
Getter for node metadata
186-
"""
187-
if not self._metadata:
188-
self._metadata = get_metadata(self)
124+
def get_emails(self):
125+
if not self._loaded:
126+
self.load_data()
127+
return self._emails
128+
129+
def get_json(self):
130+
if not self._loaded:
131+
self.load_data()
132+
return self._json_data
133+
134+
def get_meatadta(self):
135+
if not self._loaded:
136+
self.load_data()
189137
return self._metadata
190-
191-
@staticmethod
192-
def valid_email(email):
193-
"""Static method used to validate emails.
194-
195-
Args:
196-
email (str): Email string to be validated.
197-
198-
Returns:
199-
(bool): True if email string is valid, else false.
200-
"""
201-
if validators.email(email):
202-
return True
203-
return False
204-
205-
@staticmethod
206-
def valid_link(link):
207-
"""Static method used to validate links
208-
209-
Args:
210-
link (str): URL string to be validated.
211-
212-
Returns:
213-
(bool): True if URL string is valid, else false.
214-
"""
215-
if validators.url(link):
216-
return True
217-
return False

0 commit comments

Comments
 (0)