Skip to content

Commit 794881a

Browse files
committed
add category_limit, store_limit, product_limit
1 parent 3a8d09e commit 794881a

File tree

3 files changed

+37
-24
lines changed

3 files changed

+37
-24
lines changed

README.md

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,32 @@
1-
```bash
2-
scrapy crawl 1_stores -a store="https://shop.despar.com"
3-
```
1+
## How to run
2+
1) **Stores Spider** scrapes store info for the domains [[shop.despar.com](https://shop.despar.com), [shop.desparsicilia.it](https://shop.desparsicilia.it)]
43

5-
``` bash
6-
scrapy crawl 2_product_list -a dev=True
7-
```
4+
```bash
5+
scrapy crawl 1_stores -a store="https://shop.despar.com"
6+
```
87

9-
```bash
10-
scrapy crawl 3_product_details -a dev=True
11-
```
8+
2) **Product List Spider** scrapes the categories, the product list of each category, and the promos in the store
9+
10+
``` bash
11+
scrapy crawl 2_product_list -a store_limit=1 -a category_limit=3
12+
```
13+
14+
3) **Product Details Spider** scrapes the product descriptions and image links
15+
```bash
16+
scrapy crawl 3_product_details -a product_limit=3
17+
```
1218

1319
---
1420

21+
## Endpoints
22+
23+
```https://{domain}/spesa-consegna-domicilio/{zip_code}```
24+
25+
```https://{domain}/spesa-ritiro-negozio/{store_slug}```
26+
27+
```{store_url}/{category_slug}```
1528

16-
https://{ domain }/spesa-consegna-domicilio/{ zip_code }
17-
https://{ domain }/spesa-ritiro-negozio/{ store_slug }
29+
```{store_url}/prodotto/{product_id}```
1830

19-
{ store_url }/{ category_slug }
20-
{ store_url }/prodotto/{ product_id }
21-
{ store_url }/ajax/productsPagination?{ category_id, page_num }
31+
```{store_url}/ajax/productsPagination?{category_id,page_num}```
2232

despar_scraper/spiders/2_product_list.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,16 @@ class ProductListSpider(scrapy.Spider):
3535
}
3636
}
3737

38-
def __init__(self, store_list_file='data/json/stores.json', dev=None, *args, **kwargs):
38+
def __init__(self, store_list_file='data/json/stores.json', store_limit='0', category_limit='0', *args, **kwargs):
3939
super().__init__(*args, **kwargs)
4040
self.store_list_file = store_list_file
41-
self.dev = dev
41+
self.category_limit = int(category_limit)
42+
self.store_limit = int(store_limit)
4243

4344
async def start(self):
4445
# open json file in async function
4546
with open(self.store_list_file, 'r') as f:
46-
store_list = json.load(f) if not self.dev else json.load(f)[:1]
47+
store_list = json.load(f) if not self.store_limit else json.load(f)[:self.store_limit]
4748

4849
for store in store_list:
4950
yield scrapy.Request(
@@ -54,7 +55,7 @@ async def start(self):
5455

5556
def parse__get_categories(self, response: HtmlResponse):
5657
store = response.meta['store']
57-
58+
cat_count = 0
5859
# Extract categories
5960
for main_tag in response.css('div.main-navigation__item--container'):
6061
main_category = main_tag.css('div.main-navigation__item--title > span::text').get(default='').strip()
@@ -105,9 +106,11 @@ def parse__get_categories(self, response: HtmlResponse):
105106
'page': 1, 'attempt': 1
106107
},
107108
)
108-
109-
if self.dev:
110-
return None
109+
110+
cat_count += 1
111+
if self.category_limit:
112+
if cat_count >= self.category_limit:
113+
return None
111114

112115

113116
def parse__get_products_list(self, response: HtmlResponse):

despar_scraper/spiders/3_product_details.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,15 @@ class ProductDetailsSpider(scrapy.Spider):
2121
}
2222
}
2323

24-
def __init__(self, product_list_file='data/json/product_list.json', dev=False, *args, **kwargs):
24+
def __init__(self, product_list_file='data/json/product_list.json', product_limit='0', *args, **kwargs):
2525
super().__init__(*args, **kwargs)
2626
self.product_list_file = product_list_file
27-
self.dev = dev
27+
self.product_limit = int(product_limit)
2828

2929
async def start(self):
3030
# open json file in async function
3131
with open(self.product_list_file, 'r') as f:
32-
product_list = json.load(f) if not self.dev else json.load(f)[:10]
32+
product_list = json.load(f) if not self.product_limit else json.load(f)[:self.product_limit]
3333

3434
for prod in product_list:
3535
yield scrapy.Request(

0 commit comments

Comments
 (0)