This cheatsheet provides a quick reference for the key features of Scrapy, a Python web crawling and web scraping framework. Use this cheatsheet as a reference to help you write Scrapy code more efficiently.
You can install Scrapy using pip:
pip install scrapy
scrapy startproject project_name
scrapy genspider spider_name domain.com
import scrapy
class SpiderName(scrapy.Spider):
    """Template spider that starts from ``start_urls`` and parses each response."""

    # Identifier used on the command line: ``scrapy crawl spider_name``.
    name = 'spider_name'
    # URLs requested when the crawl begins.
    start_urls = ['https://domain.com']

    def parse(self, response):
        """Handle a downloaded response.

        Placeholder callback: add the extraction logic here and yield
        items and/or follow-up requests built from ``response``.
        """
        # Code to extract data from the response
import scrapy
class ItemName(scrapy.Item):
    """Container declaring the fields a scraped item may carry."""

    # One ``scrapy.Field()`` per attribute; rename to match the scraped data.
    field1 = scrapy.Field()
    field2 = scrapy.Field()
# Usage fragment: `yield` is only valid inside a function body (e.g. a spider callback such as parse()); value1 and value2 are placeholders for extracted values.
yield ItemName(field1=value1, field2=value2)
class PipelineName:
    """Item pipeline template; enable it through the ``ITEM_PIPELINES`` setting."""

    def process_item(self, item, spider):
        """Process one scraped item and hand it on.

        Args:
            item: The item produced by a spider.
            spider: The spider that produced ``item``.

        Returns:
            The item, unchanged in this template.
        """
        # Code to process the item
        return item
# Enabled item pipelines, keyed by import path. The integer sets the run
# order: lower values run first (values are customarily in the 0-1000 range).
ITEM_PIPELINES = {"project_name.pipelines.PipelineName": 300}
# Name of the bot implemented by this Scrapy project.
BOT_NAME = "project_name"

# Respect the target site's robots.txt rules.
ROBOTSTXT_OBEY = True

# User-Agent header sent with requests (split across lines for readability;
# the adjacent literals concatenate into a single string).
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/58.0.3029.110 Safari/537.36"
)

# Seconds to wait between consecutive requests to the same website.
DOWNLOAD_DELAY = 3

# Maximum number of requests the downloader performs concurrently.
CONCURRENT_REQUESTS = 1
scrapy crawl spider_name