22This module is used to create a LinkNode that can be consumed by a LinkTree
33and contains useful Link methods.
44"""
5- import requests
6- import requests .exceptions
7- import validators
85import re
6+ import requests
97from bs4 import BeautifulSoup
10- from . utils import multi_thread
8+
119from .color import color
12- import sys
10+ from . validators import validate_email , validate_link
1311
1412def get_emails (node ):
1513 """Finds all emails associated with node
@@ -21,29 +19,21 @@ def get_emails(node):
2119 emails (list): List of emails.
2220 """
2321 emails = []
24- response = node .response .text
25- mails = re .findall (r'[\w\.-]+@[\w\.-]+' , response )
22+ mails = re .findall (r'[\w\.-]+@[\w\.-]+' , node ._node .get_text ())
2623 for email in mails :
27- if LinkNode . valid_email (email ):
24+ if validate_email (email ):
2825 emails .append (email )
2926 return emails
3027
3128
32- def get_links (node ):
33- """Finds all links associated with node
34-
35- Args:
36- node (LinkNode): Node used to get links from.
37-
38- Returns:
39- links (list): List of links.
40- """
41- links = []
42- for child in node .children :
43- link = child .get ('href' )
44- if link and LinkNode .valid_link (link ):
45- links .append (link )
46- return links
def get_children(node):
    """Build a child LinkNode for every valid hyperlink found in node.

    Args:
        node (LinkNode): Node whose parsed document (``node._node``) is
            searched for anchor tags.

    Returns:
        children (list): List of LinkNode objects, one per anchor tag whose
            ``href`` is present and accepted by ``validate_link``.
    """
    children = []
    for anchor_tag in node._node.find_all('a'):
        link = anchor_tag.get('href')
        # Anchors without an href yield None (or an empty string); skip them
        # before validating instead of handing a non-URL to validate_link.
        if link and validate_link(link):
            children.append(LinkNode(link))
    return children
4737
4838
4939def get_json_data (node ):
@@ -56,13 +46,9 @@ def get_json_data(node):
5646 titles (list): List of Titles.
5747 """
5848 json = []
59- for child in node .children :
60- link = child .get ('href' )
61- title = "Not Available"
62- if link and LinkNode .valid_link (link ):
63- node = LinkNode (link )
64- title = node .name
65- json .append ({"link" :link ,"title" :title })
49+ for anchor_tag in node ._node .find_all ('a' ):
50+ link = anchor_tag .get ('href' )
51+ json .append ({"link" :link ,"tag" :anchor_tag })
6652 return json
6753
6854
@@ -73,26 +59,14 @@ def get_images(node):
7359 node (LinkNode): Node used to get links from.
7460
7561 Returns:
76- links (list): List of links.
62+ imageEls (list): A collection of img HTML elements
7763 """
78- links = []
79- for child in node .children :
80- link = child .get ('src' )
81- if link and LinkNode .valid_link (link ):
82- links .append (link )
83- return links
84-
85-
86- def get_metadata (node ):
87- """Collect response headers.
88-
89- Args:
90- node (LinkNode): Node used to get metadata from.
91-
92- Returns:
93- metadata (dict): Dictionary with metadata.
94- """
95- return node .response .headers
64+ imageEls = []
65+ for anchor_tag in node ._node .find_all ('a' ):
66+ image = anchor_tag .get ('src' )
67+ if validate_link (image ):
68+ imageEls .append (image )
69+ return imageEls
9670
9771
9872class LinkNode :
@@ -105,113 +79,59 @@ def __init__(self, link):
10579 link (str): URL used to initialise node.
10680 """
10781 # If link has invalid form, throw an error
108- if not self . valid_link (link ):
82+ if not validate_link (link ):
10983 raise ValueError ("Invalid link format." )
11084
111- self ._children = []
112- self ._emails = []
113- self ._links = []
114- self ._images = []
115- self ._json_data = []
116- self ._metadata = {}
85+ self ._loaded = False
86+ self ._name = link
87+ self ._link = link
11788
118- # Attempts to connect to link, throws an error if link is unreachable
def load_data(self):
    """Fetch ``self._link`` and populate the node's cached page data.

    Sets ``status``, ``_name``, ``_node``, ``_metadata``, ``_children``,
    ``_emails``, ``_images`` and ``_json_data``, then marks the node as
    loaded. Every one of those attributes is assigned on all paths, so the
    getters never hit a missing attribute after a failed load.

    Raises:
        Whatever ``requests.get`` raises for a failed connection; the
        request itself is intentionally made outside the ``try`` block, as
        before, so those errors still reach the caller.
    """
    response = requests.get(self._link)
    status = str(response.status_code)

    # Pessimistic defaults: overwritten below only when loading succeeds.
    self._node = None
    self._metadata = {}
    self._children = []
    self._emails = []
    self._images = []
    self._json_data = []
    self._name = 'TITLE NOT FOUND'
    self.status = color(status, 'yellow')
    try:
        response.raise_for_status()
        self._metadata = response.headers
        self._node = BeautifulSoup(response.text, 'html.parser')

        # A page without a usable <title> keeps the fallback name and the
        # yellow status, but its links/emails/images are still collected.
        title = self._node.title
        if title is not None and title.string:
            self._name = title.string
            self.status = color(status, 'green')

        self._children = get_children(self)
        self._emails = get_emails(self)
        self._images = get_images(self)
        self._json_data = get_json_data(self)
    except Exception:
        # Best-effort load: an HTTP error status or a parsing failure
        # degrades the node to the "not found" state instead of raising.
        self._node = None
        self._name = 'TITLE NOT FOUND'
        self.status = color(status, 'yellow')
    finally:
        self._loaded = True
154109
155- @property
156- def links (self ):
157- """
158- Getter for node links
159- """
160- if not self ._links :
161- self ._links = get_links (self )
162- return self ._links
163110
164- @property
165- def images (self ):
166- """
167- Getter for node images
168- """
169- if not self ._images :
170- self ._images = get_images (self )
171- return self ._images
def get_link(self):
    """Return the URL stored on this node (``self._link``)."""
    url = self._link
    return url
172113
173- @property
174- def children (self ):
175- """
176- Getter for node children
177- """
178- if not self ._children :
179- self ._children = self ._node .find_all ('a' )
def get_name(self):
    """Return the node's name, loading the page data first if needed.

    ``load_data()`` is called only while ``self._loaded`` is falsy.
    """
    if self._loaded:
        return self._name
    self.load_data()
    return self._name
118+
def get_children(self):
    """Return the node's children, loading the page data first if needed.

    ``load_data()`` is called only while ``self._loaded`` is falsy.
    """
    if self._loaded:
        return self._children
    self.load_data()
    return self._children
181123
182- @property
183- def metadata (self ):
184- """
185- Getter for node metadata
186- """
187- if not self ._metadata :
188- self ._metadata = get_metadata (self )
def get_emails(self):
    """Return the node's emails, loading the page data first if needed.

    ``load_data()`` is called only while ``self._loaded`` is falsy.
    """
    if self._loaded:
        return self._emails
    self.load_data()
    return self._emails
128+
def get_json(self):
    """Return the node's JSON data, loading the page data first if needed.

    ``load_data()`` is called only while ``self._loaded`` is falsy.
    """
    if self._loaded:
        return self._json_data
    self.load_data()
    return self._json_data
133+
def get_metadata(self):
    """Return the response headers captured for this node.

    ``load_data()`` is called first while ``self._loaded`` is falsy.

    Returns:
        The value stored in ``self._metadata`` by ``load_data()``.
    """
    if not self._loaded:
        self.load_data()
    return self._metadata


# The original, misspelled name is kept as an alias so existing callers
# continue to work.
get_meatadta = get_metadata
190-
191- @staticmethod
192- def valid_email (email ):
193- """Static method used to validate emails.
194-
195- Args:
196- email (str): Email string to be validated.
197-
198- Returns:
199- (bool): True if email string is valid, else false.
200- """
201- if validators .email (email ):
202- return True
203- return False
204-
205- @staticmethod
206- def valid_link (link ):
207- """Static method used to validate links
208-
209- Args:
210- link (str): URL string to be validated.
211-
212- Returns:
213- (bool): True if URL string is valid, else false.
214- """
215- if validators .url (link ):
216- return True
217- return False
0 commit comments