Skip to content

Commit 74e2436

Browse files
authored
Merge pull request #218 from DedSecInside/add_nlp
Add website classification
2 parents b5f7213 + e721fff commit 74e2436

7 files changed

Lines changed: 1612 additions & 3 deletions

File tree

.gitignore

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ torBot
3030

3131
venv/
3232
.venv/
33-
*.csv
3433
.DS_Store
3534
.env
36-
data/*.csv
35+
data/*.csv

poetry.lock

Lines changed: 112 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ python-dotenv = "^0.10.2"
2121
threadsafe = "^1.0.0"
2222
progress = "^1.5.0"
2323
numpy = "^1.20.2"
24+
scikit-learn = "^0.24.2"
2425

2526
[tool.poetry.dev-dependencies]
2627
pytest = "^6.2.3"

src/nlp/README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Natural Language Processing Library
2+
3+
This library provides tools for performing natural language processing on websites.
4+
This library is currently in its infancy and can only be used for testing.
5+
6+
To test gathering data use:
7+
`python3 gather_data.py`
8+
* This will generate the data necessary to train the classification model
9+
10+
To predict the classification of a website use:
11+
`python3 main.py -website https://www.github.com`
12+
* Add `-accuracy` argument, to view the accuracy of the prediction

src/nlp/gather_data.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import csv
2+
from pathlib import Path
3+
4+
5+
def write_data():
    """
    Write the training data from the csv file into a directory tree matching
    the scikit-learn.datasets `load_files` specification.

    dataset source: https://www.kaggle.com/hetulmehta/website-classification

    e.g.
    container_folder/
       category_1_folder/
          file_1.txt file_2.txt file_3.txt ... file_42.txt
       category_2_folder/
          file_43.txt file_44.txt ...
    """
    # newline='' is the documented way to hand a file to csv.reader.
    with open('website_classification.csv', newline='') as csvfile:
        website_reader = csv.reader(csvfile)
        for row in website_reader:
            # Columns: id, website url, page text, category label.
            # `row_id` instead of `id` to avoid shadowing the builtin;
            # the url column is unused here.
            row_id, _website, content, category = row
            # The header row carries the literal value 'category' — skip it.
            if category != 'category':
                # A '/' inside a label would create an unintended
                # sub-directory, breaking the one-folder-per-category layout.
                safe_category = category.replace('/', '+')
                category_dir = Path('training_data') / safe_category
                category_dir.mkdir(parents=True, exist_ok=True)
                (category_dir / f'{row_id}.txt').write_text(content)
30+
31+
32+
# Script entry point: build the `load_files` training tree from the csv.
if __name__ == "__main__":
    write_data()

src/nlp/main.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""Classify a website's category with a bag-of-words text classifier.

Usage: python3 main.py -website https://www.github.com [-accuracy]

Trains on the `training_data/` directory produced by gather_data.py
(one sub-directory per category, per sklearn's `load_files` spec).
"""
import argparse

import requests
import numpy as np
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.datasets import load_files

# Parse CLI arguments and fetch the target page's visible text.
parser = argparse.ArgumentParser(description='Classify Website')
parser.add_argument('-website', type=str, help='Website to categorize')
# BUG FIX: `type=bool` made any supplied string truthy (bool('False') is
# True). `store_true` gives a proper on/off flag, matching the README
# usage of a bare `-accuracy` argument.
parser.add_argument('-accuracy', action='store_true', help='Print accuracy')
args = parser.parse_args()
soup = BeautifulSoup(requests.get(args.website).text, features='html.parser')
html = soup.get_text()

# Build the classifier: token counts -> tf-idf weighting -> linear model.
clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier())
])
dataset = load_files('training_data')
x_train, x_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target
)
clf.fit(x_train, y_train)

# Prefer the page's <title> as a display name; fall back to 'Unknown'.
website = 'Unknown'
if soup.title:
    website = soup.title.text

# predict() returns an array of target indices; map back to the label name.
predicted = clf.predict([html])
print(f'The category of {website} is {dataset.target_names[predicted[0]]}')

if args.accuracy:
    # BUG FIX: the original compared the single-site prediction against the
    # entire held-out label vector (a meaningless broadcast). Score the
    # model on the held-out split instead, and report an actual percentage.
    accuracy = np.mean(clf.predict(x_test) == y_test)
    print(f'Accuracy: {accuracy * 100:.2f}%')

0 commit comments

Comments
 (0)