-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path38 - Web Scraper with Python.py
More file actions
42 lines (29 loc) · 1.17 KB
/
38 - Web Scraper with Python.py
File metadata and controls
42 lines (29 loc) · 1.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import urllib.request
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv
class Scraper:
    """Fetch a page and collect absolute URLs for its relative ("./") links."""

    def __init__(self, site):
        """Store the base URL to scrape.

        Args:
            site: URL of the page whose anchor tags will be collected.
        """
        self.site = site

    def scrape(self):
        """Download the page and extract absolute URLs for relative links.

        Only anchors whose ``href`` starts with ``"./"`` are kept (on
        Google News these are the article links); each is resolved
        against ``self.site`` with ``urljoin``.

        Returns:
            list[str]: De-duplicated absolute URLs, sorted for
            deterministic output (the original set order was arbitrary).
        """
        headers = {"User-Agent": "Mozilla/5.0"}
        request = urllib.request.Request(self.site, headers=headers)
        # Context manager closes the HTTP response even on error;
        # the original left the connection open (resource leak).
        with urllib.request.urlopen(request) as response:
            html = response.read()
        soup = BeautifulSoup(html, "html.parser")
        links = set()
        for tag in soup.find_all("a", href=True):
            url = tag["href"]
            if url.startswith("./"):
                links.add(urljoin(self.site, url))
        return sorted(links)
# Scrape the Google News front page for article links.
news = "https://news.google.com/"
scraper = Scraper(news)
article_links = scraper.scrape()

# Persist the collected URLs, one per row, as a UTF-8 CSV with a header.
with open("news_articles.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Article_URL"])  # header row
    writer.writerows([link] for link in article_links)

print(f"✅ Saved {len(article_links)} links to news_articles.csv")