Skip to content

Commit e429c00

Browse files
Add scraping tasks for the youth Adidas index pages; still need to add a way to scrape the remaining pages
1 parent 11f32ce commit e429c00

11 files changed

Lines changed: 211 additions & 51 deletions

File tree

src/solesearch_api/routes/scrape.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
from fastapi import APIRouter, HTTPException, Request
33

44
from solesearch_api.tasks.scraping import (
5+
adidas_baby_and_toddler_task,
6+
adidas_children_and_little_kids_task,
7+
adidas_youth_and_big_kids_task,
58
adidas_new_releases_task,
6-
nike_instock_scraping_task,
7-
nike_scraping_task,
9+
nike_in_stock_scraping_task,
10+
nike_new_releases_task,
811
)
912

1013
router = APIRouter(
@@ -28,18 +31,22 @@ def task_result_factory(request: Request, task: AsyncResult):
2831
}
2932

3033

31-
@router.get("/{retailer}")
34+
@router.get("/{task_name:path}")
3235
async def scrape_retailer_new_releases(
33-
request: Request, retailer: str, nocache: bool = False
36+
request: Request, task_name: str, nocache: bool = False
3437
):
38+
3539
task_mapping = {
36-
"nike": nike_scraping_task,
37-
"nike_instock": nike_instock_scraping_task,
40+
"nike": nike_new_releases_task,
41+
"nike/in_stock": nike_in_stock_scraping_task,
3842
"adidas": adidas_new_releases_task,
43+
"adidas/baby": adidas_baby_and_toddler_task,
44+
"adidas/children": adidas_children_and_little_kids_task,
45+
"adidas/youth": adidas_youth_and_big_kids_task,
3946
}
40-
if retailer not in task_mapping:
41-
raise HTTPException(status_code=404, detail="Retailer not found")
42-
task = task_mapping[retailer].delay(nocache=nocache)
47+
if task_name not in task_mapping:
48+
raise HTTPException(status_code=404, detail="Task not found")
49+
task = task_mapping[task_name].delay(nocache=nocache)
4350
return task_status_factory(request, task)
4451

4552

src/solesearch_api/tasks/__init__.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,27 @@ def init_beat(*args, **kwargs):
3030

3131
app.conf.beat_schedule = {
3232
"adidas-new-releases-every-4-hours": {
33-
"task": "solesearch_api.tasks.scraping.retail.adidas.new_releases",
33+
"task": "solesearch_api.tasks.scraping.AdidasNewReleasesScrapingTask",
3434
"schedule": 4 * 60 * 60, # every 4 hours
3535
},
36-
"adidas-pdp-every-4-hours": {
37-
"task": "solesearch_api.tasks.scraping.retail.adidas.pdp",
36+
"adidas-baby-and-toddler-every-4-hours": {
37+
"task": "solesearch_api.tasks.scraping.AdidasBabyAndToddlerScrapingTask",
38+
"schedule": 4 * 60 * 60, # every 4 hours
39+
},
40+
"adidas-children-and-little-kids-every-4-hours": {
41+
"task": "solesearch_api.tasks.scraping.AdidasChildrenAndLittleKidsScrapingTask",
42+
"schedule": 4 * 60 * 60, # every 4 hours
43+
},
44+
"adidas-youth-and-big-kids-every-4-hours": {
45+
"task": "solesearch_api.tasks.scraping.AdidasYouthAndBigKidsScrapingTask",
3846
"schedule": 4 * 60 * 60, # every 4 hours
3947
},
4048
"nike-new-releases-every-4-hours": {
41-
"task": "solesearch_api.tasks.scraping.retail.nike.new_releases",
49+
"task": "solesearch_api.tasks.scraping.NikeNewReleasesScrapingTask",
4250
"schedule": 4 * 60 * 60, # every 4 hours
4351
},
4452
"nike-in-stock-every-4-hours": {
45-
"task": "solesearch_api.tasks.scraping.retail.nike.in_stock",
53+
"task": "solesearch_api.tasks.scraping.NikeInStockScrapingTask",
4654
"schedule": 4 * 60 * 60, # every 4 hours
4755
},
4856
"healthcheck-every-minute": {

src/solesearch_api/tasks/scraping/__init__.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,19 @@
11
# Import all tasks to ensure they are registered with Celery
2+
3+
# Adidas Tasks
24
from solesearch_api.tasks.scraping.retail.adidas.new_releases import (
35
adidas_new_releases_task,
46
)
5-
from solesearch_api.tasks.scraping.retail.adidas.pdp import adidas_pdp_task
7+
from solesearch_api.tasks.scraping.retail.adidas.kids_sneakers import (
8+
adidas_baby_and_toddler_task,
9+
adidas_children_and_little_kids_task,
10+
adidas_youth_and_big_kids_task,
11+
)
12+
13+
# Nike Tasks
14+
from solesearch_api.tasks.scraping.retail.nike.new_releases import (
15+
nike_new_releases_task,
16+
)
617
from solesearch_api.tasks.scraping.retail.nike.in_stock import (
7-
nike_instock_scraping_task,
18+
nike_in_stock_scraping_task,
819
)
9-
from solesearch_api.tasks.scraping.retail.nike.new_releases import nike_scraping_task

src/solesearch_api/tasks/scraping/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def get_html(
7373
file_path = os.path.join(
7474
HTML_DIR,
7575
self.brand,
76+
self.__class__.__name__,
7677
datetime.now().strftime("%Y-%m-%d.html"),
7778
)
7879
if os.path.exists(file_path):
@@ -106,6 +107,7 @@ def get_json(self) -> dict:
106107
file_path = os.path.join(
107108
JSON_DIR,
108109
self.brand,
110+
self.__class__.__name__,
109111
datetime.now().strftime("%Y-%m-%d.json"),
110112
)
111113
if os.path.exists(file_path):

src/solesearch_api/tasks/scraping/retail/adidas/base.py

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
1-
from solesearch_api.models.enums import Audience
1+
from datetime import datetime, timezone
2+
import logging
3+
import re
4+
from solesearch_api.models.enums import Audience, Platform
5+
from solesearch_api.models.sneaker import Image, Link, Sneaker
6+
from solesearch_api.tasks.db.base import create_or_update_sneaker
27
from solesearch_api.tasks.scraping.base import BaseScrapingTask
38

9+
logger = logging.getLogger(__name__)
10+
411

512
class AdidasScrapingTask(BaseScrapingTask):
613
def __init__(self, *args, **kwargs):
@@ -13,19 +20,21 @@ def __init__(self, *args, **kwargs):
1320
"U": Audience.UNISEX,
1421
},
1522
headers={
16-
"sec-ch-ua": '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
17-
"sec-ch-ua-mobile": "?0",
18-
"sec-ch-ua-platform": '"macOS"',
19-
"DNT": "1",
20-
"Origin": "https://www.adidas.com",
21-
"Host": "www.adidas.com",
22-
"Sec-Fetch-Site": "same-origin",
23-
"Sec-Fetch-Mode": "navigate",
24-
"Sec-Fetch-User": "?1",
25-
"Sec-Fetch-Dest": "document",
26-
"Accept-Language": "en-US,en;q=0.9",
27-
"Accept-Encoding": "gzip, deflate, br",
28-
"Cookie": "geo_country=US; geo_state=CA",
23+
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
24+
"accept-language": "en-US,en;q=0.9",
25+
"cache-control": "max-age=0",
26+
"priority": "u=0, i",
27+
"sec-ch-ua": '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
28+
"sec-ch-ua-mobile": "?1",
29+
"sec-ch-ua-platform": '"Android"',
30+
"sec-fetch-dest": "document",
31+
"sec-fetch-mode": "navigate",
32+
"sec-fetch-site": "same-origin",
33+
"sec-fetch-user": "?1",
34+
"sec-gpc": "1",
35+
"upgrade-insecure-requests": "1",
36+
"Referer": "https://www.adidas.com/us/shoes",
37+
"Referrer-Policy": "strict-origin-when-cross-origin",
2938
},
3039
*args,
3140
**kwargs,
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
from datetime import datetime, timezone
2+
import logging
3+
import re
4+
from solesearch_api.models.enums import Audience, Platform
5+
from solesearch_api.models.sneaker import Image, Link, Sneaker
6+
from solesearch_api.tasks.db.base import create_or_update_sneaker
7+
from solesearch_api.tasks.scraping.retail.adidas.base import AdidasScrapingTask
8+
from solesearch_api.tasks.scraping.task_registry import register_scraping_task
9+
from solesearch_api.utils.extractors import next_json_extractor
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
class AdidasYouthBaseScrapingTask(AdidasScrapingTask):
15+
def __init__(self, *args, **kwargs):
16+
super().__init__(
17+
extractor=next_json_extractor,
18+
*args,
19+
**kwargs,
20+
)
21+
22+
def scrape(self, session, *args, **kwargs):
23+
json_data = self.get_json()
24+
25+
# Process each product in the products array
26+
for product in json_data.get("products", []):
27+
sneakers = self.json_to_sneakers(product)
28+
for sneaker in sneakers:
29+
create_or_update_sneaker(session, sneaker)
30+
session.commit()
31+
32+
def json_to_sneakers(self, data: dict) -> list[Sneaker] | None:
33+
sneakers = []
34+
sku = data.get("id", "").strip()
35+
if not sku:
36+
logger.warning(f"No SKU found for {self.brand} product: {data}")
37+
return None
38+
39+
price = data.get("priceData", {}).get("price")
40+
if price is not None:
41+
price = price * 100
42+
43+
sneaker = Sneaker(
44+
source=Platform.RETAIL,
45+
brand=self.brand,
46+
name=data.get("title"),
47+
sku=sku,
48+
parent_sku=data.get("modelNumber"),
49+
audience=self.audience,
50+
retail_price=price,
51+
)
52+
53+
sneaker_slug = data.get("url", "").strip()
54+
if sneaker_slug:
55+
sneaker_link = Link(
56+
url=f"https://www.adidas.com{sneaker_slug}",
57+
platform=Platform.RETAIL,
58+
)
59+
sneaker.links.append(sneaker_link)
60+
61+
for index, image in enumerate(data.get("images", [])):
62+
image_url = re.sub(r"images/[^/]+/", "images/", image.get("src"))
63+
position = image.get("metadata", {}).get("sortOrder")
64+
position = int(position) if position else index
65+
image = Image(
66+
platform=Platform.RETAIL,
67+
position=position,
68+
url=image_url,
69+
)
70+
sneaker.images.append(image)
71+
72+
sneakers.append(sneaker)
73+
74+
for variant_sku in data.get("colourVariations", []):
75+
variant_sku = variant_sku.strip()
76+
if not variant_sku:
77+
continue
78+
79+
variant_sneaker = Sneaker(
80+
source=Platform.RETAIL,
81+
brand=self.brand,
82+
name=data.get("title"),
83+
sku=variant_sku,
84+
parent_sku=data.get("modelNumber"),
85+
audience=self.audience,
86+
retail_price=price,
87+
)
88+
89+
variant_sneaker_slug = data.get("url", "").strip()
90+
if variant_sneaker_slug:
91+
variant_sneaker_slug = variant_sneaker_slug.split("/")[:-1] + [
92+
f"{variant_sku}.html"
93+
]
94+
variant_sneaker_slug = "/".join(variant_sneaker_slug)
95+
if not variant_sneaker_slug.startswith("/"):
96+
variant_sneaker_slug = f"/{variant_sneaker_slug}"
97+
variant_sneaker_link = Link(
98+
url=f"https://www.adidas.com{variant_sneaker_slug}",
99+
platform=Platform.RETAIL,
100+
)
101+
variant_sneaker.links.append(variant_sneaker_link)
102+
103+
sneakers.append(variant_sneaker)
104+
105+
return sneakers
106+
107+
108+
class AdidasBabyAndToddlerScrapingTask(AdidasYouthBaseScrapingTask):
109+
audience = Audience.TODDLER
110+
111+
def __init__(self):
112+
super().__init__(
113+
download_url="https://www.adidas.com/us/kids-infant_toddler-shoes"
114+
)
115+
116+
117+
class AdidasChildrenAndLittleKidsScrapingTask(AdidasYouthBaseScrapingTask):
118+
audience = Audience.PRESCHOOL
119+
120+
def __init__(self):
121+
super().__init__(download_url="https://www.adidas.com/us/children-shoes")
122+
123+
124+
class AdidasYouthAndBigKidsScrapingTask(AdidasYouthBaseScrapingTask):
125+
audience = Audience.GRADE_SCHOOL
126+
127+
def __init__(self):
128+
super().__init__(download_url="https://www.adidas.com/us/youth-shoes")
129+
130+
131+
adidas_baby_and_toddler_task = register_scraping_task(AdidasBabyAndToddlerScrapingTask)
132+
adidas_children_and_little_kids_task = register_scraping_task(
133+
AdidasChildrenAndLittleKidsScrapingTask
134+
)
135+
adidas_youth_and_big_kids_task = register_scraping_task(
136+
AdidasYouthAndBigKidsScrapingTask
137+
)

src/solesearch_api/tasks/scraping/retail/adidas/new_releases.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1+
from datetime import datetime, timezone
12
import logging
23
import re
3-
from datetime import datetime, timezone
44

55
from solesearch_api.models.enums import Platform
66
from solesearch_api.models.sneaker import Image, Link, Sneaker
@@ -13,8 +13,6 @@
1313

1414

1515
class AdidasNewReleasesScrapingTask(AdidasScrapingTask):
16-
name = "solesearch_api.tasks.scraping.retail.adidas.new_releases"
17-
1816
def __init__(self, *args, **kwargs):
1917
super().__init__(
2018
download_url="https://www.adidas.com/us/release-dates",
@@ -57,7 +55,7 @@ def json_to_sneaker(self, data: dict) -> Sneaker | None:
5755
sku=sku,
5856
parent_sku=data.get("model_number"),
5957
audience=self.guess_audience(data.get("attribute_list", {}).get("gender")),
60-
release_date=data.get("release_date"),
58+
release_date=release_date,
6159
retail_price=price,
6260
)
6361

src/solesearch_api/tasks/scraping/retail/adidas/pdp.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,6 @@
1111

1212

1313
class AdidasPDPScrapingTask(AdidasScrapingTask):
14-
# This is the name that Celery will look for
15-
name = "solesearch_api.tasks.scraping.retail.adidas.pdp"
16-
1714
def __init__(self, *args, **kwargs):
1815
super().__init__(
1916
extractor=react_query_data_extractor,

src/solesearch_api/tasks/scraping/retail/nike/in_stock.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,9 @@
55

66

77
class NikeInStockScrapingTask(BaseScrapingTask):
8-
name = "solesearch_api.tasks.scraping.NikeInStockScrapingTask"
9-
108
def __init__(self):
119
super().__init__(
12-
brand="NikeInStock",
10+
brand="Nike",
1311
download_url="https://www.nike.com/launch?s=in-stock",
1412
audience_mapping={
1513
"M": Audience.MEN,
@@ -23,4 +21,4 @@ def scrape(self, session, *args, **kwargs):
2321
return json_data
2422

2523

26-
nike_instock_scraping_task = register_scraping_task(NikeInStockScrapingTask)
24+
nike_in_stock_scraping_task = register_scraping_task(NikeInStockScrapingTask)

src/solesearch_api/tasks/scraping/retail/nike/new_releases.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,8 @@
2121
logfire.configure()
2222

2323

24-
class NikeScrapingTask(BaseScrapingTask):
24+
class NikeNewReleasesScrapingTask(BaseScrapingTask):
2525
# This is the exact name that Celery is looking for
26-
name = "solesearch_api.tasks.scraping.NikeScrapingTask"
27-
2826
def __init__(self):
2927
super().__init__(
3028
brand="Nike",
@@ -338,4 +336,4 @@ def json_to_sneakers(self, json: dict) -> list[Sneaker]:
338336

339337

340338
# Replace the class-based task registration with the decorator-based approach
341-
nike_scraping_task = register_scraping_task(NikeScrapingTask)
339+
nike_new_releases_task = register_scraping_task(NikeNewReleasesScrapingTask)

0 commit comments

Comments (0)