Skip to content

Commit 2c482ae

Browse files
committed
Change web scraping library to playwright
1 parent d12b61a commit 2c482ae

5 files changed

Lines changed: 77 additions & 56 deletions

File tree

.DS_Store

6 KB
Binary file not shown.

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Act bin file for local testing
2+
bin/
3+
14
# Byte-compiled / optimized / DLL files
25
__pycache__/
36
*.py[cod]
@@ -158,3 +161,6 @@ cython_debug/
158161
# and can be added to the global gitignore or merged into this file. For a more nuclear
159162
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160163
#.idea/
164+
165+
playwright/.auth
166+
screen-shots

funcs/replit_playwright.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
"""Log in to Replit with Playwright and download a repl's files as a zip.

Reads LOGINURL, EMAIL, and PASSWORD from the environment (a local ``.env``
file is honored via python-dotenv).  Progress screenshots and the downloaded
archive are written to ``./screen-shots/``.
"""
from playwright.sync_api import sync_playwright
from playwright_stealth import stealth_sync
import os
from dotenv import load_dotenv
load_dotenv()

# Desktop Chrome user agent string sent with every request.
ua = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/69.0.3497.100 Safari/537.36"
)

test_url = "https://replit.com/@pythondojoarchi/SlipperyGargantuanDebuggers"

# Fix: screenshots and the download fail if the output directory is missing
# (it is gitignored, so a fresh checkout will not have it).
os.makedirs("./screen-shots", exist_ok=True)

with sync_playwright() as p:
    # For local debugging launch with headless=False, slow_mo=50.
    browser = p.chromium.launch()
    context = browser.new_context(user_agent=ua)
    page = context.new_page()
    stealth_sync(page)  # reduce headless-browser fingerprinting

    # --- Login ---
    page.goto(os.environ['LOGINURL'], wait_until="domcontentloaded")
    page.screenshot(path="./screen-shots/replit.png")
    page.locator("xpath=/html/body/div[1]/div/div[2]/div/main/div[2]/div/form/div[1]/input").fill(os.environ['EMAIL'])
    page.locator("xpath=/html/body/div[1]/div/div[2]/div/main/div[2]/div/form/div[2]/div/input").fill(os.environ['PASSWORD'])
    page.locator("xpath=/html/body/div[1]/div/div[2]/div/main/div[2]/div/form/div[3]/button").click()
    page.wait_for_url("https://replit.com/~")
    page.screenshot(path="./screen-shots/replit_after_login.png")

    # --- Download repo files as zip ---
    page.goto(test_url, wait_until="domcontentloaded")
    page.locator("xpath=/html/body/div[1]/div[1]/div[1]/div[2]/div/div[1]/div/div[3]/div/div[1]/button/div/span").wait_for()

    # The header button's label only becomes "Run" once the repl has finished
    # booting.  Fix: bound the poll (~2 minutes) so a stuck page cannot make
    # the script spin forever; the original looped without a limit.
    run_button = page.locator("xpath=/html/body/div[1]/div[1]/div[1]/div[2]/header/div[2]/button")
    for _ in range(60):
        if run_button.text_content() == "Run":
            break
        print(run_button.text_content())
        page.wait_for_timeout(2000)
    else:
        raise TimeoutError("repl header button never reached the 'Run' state")
    page.screenshot(path="./screen-shots/target_page.png")

    # Open the file-tree kebab menu, then click the "Download as zip" entry
    # while Playwright captures the resulting download.
    page.locator("xpath=/html/body/div[1]/div[1]/div[1]/div[2]/div/div[1]/div/div[2]/div[1]/div[1]/div/div[3]/button").click()
    with page.expect_download() as download_info:
        page.locator("xpath=/html/body/div[@class='css-1o92kwk']//div[@id='item-4']//div[@class='css-1l2rn59']").click()
    download = download_info.value
    download.save_as(f"./screen-shots/{download.suggested_filename}")

    # Clean-up
    context.close()
    browser.close()

funcs/replit_scrapper.py

Lines changed: 0 additions & 56 deletions
This file was deleted.

requirements.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,23 @@
1+
certifi==2023.11.17
2+
charset-normalizer==3.3.2
13
flake8==6.1.0
4+
greenlet==3.0.1
5+
idna==3.6
6+
iniconfig==2.0.0
27
mccabe==0.7.0
8+
packaging==23.2
9+
playwright==1.40.0
10+
playwright-stealth==1.0.6
11+
pluggy==1.3.0
312
pycodestyle==2.11.0
13+
pyee==11.0.1
414
pyflakes==3.1.0
15+
pytest==7.4.3
16+
pytest-base-url==2.0.0
17+
pytest-playwright==0.4.3
18+
python-dotenv==1.0.0
19+
python-slugify==8.0.1
20+
requests==2.31.0
21+
text-unidecode==1.3
22+
typing_extensions==4.9.0
23+
urllib3==2.1.0

0 commit comments

Comments
 (0)