|
| 1 | +import argparse |
| 2 | +import requests |
| 3 | +import numpy as np |
| 4 | + |
| 5 | +from bs4 import BeautifulSoup |
| 6 | +from sklearn.model_selection import train_test_split |
| 7 | +from sklearn.pipeline import Pipeline |
| 8 | +from sklearn.linear_model import SGDClassifier |
| 9 | +from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer |
| 10 | +from sklearn.datasets import load_files |
| 11 | + |
# ---- command-line interface ----
parser = argparse.ArgumentParser(description='Classify Website')
parser.add_argument('-website', type=str, required=True,
                    help='Website to categorize')
# BUG FIX: argparse's `type=bool` treats ANY non-empty string (even
# "False") as True; a store_true flag is the correct boolean switch.
parser.add_argument('-accuracy', action='store_true',
                    help='Print accuracy on the held-out test split')
args = parser.parse_args()

# ---- fetch the target page and strip its markup down to plain text ----
soup = BeautifulSoup(requests.get(args.website).text, features='html.parser')
html = soup.get_text()

# ---- build and train the classifier ----
# Bag-of-words counts -> tf-idf weighting -> linear model trained with SGD.
clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
# Expects a 'training_data/' directory with one sub-folder per category.
dataset = load_files('training_data')
x_train, x_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
)
clf.fit(x_train, y_train)


# Use the page <title> as a display name when one exists.
website = 'Unknown'
if soup.title:
    website = soup.title.text

# predict() returns an array of integer target indices; map the first
# (and only) one back to its human-readable category name.
predicted = clf.predict([html])
print(f'The category of {website} is {dataset.target_names[predicted[0]]}')

if args.accuracy:
    # BUG FIX: the original compared the single-website prediction against
    # the whole test-label vector, which broadcasts into a meaningless
    # score. Accuracy must be measured on predictions for the held-out
    # test set.
    test_predictions = clf.predict(x_test)
    accuracy = np.mean(test_predictions == y_test)
    # BUG FIX: np.mean yields a fraction in [0, 1]; scale it before
    # printing the '%' suffix.
    print(f'Accuracy: {accuracy * 100:.1f}%')
0 commit comments