How to scrape multiple pages from a website? - python

(Very) new to Python and programming in general.
I've been trying to scrape data from more pages/sections of the same website with Scrapy.
My code works, but it's unreadable and not practical.
import scrapy

class SomeSpider(scrapy.Spider):
    name = 'some'
    allowed_domains = ['https://example.com']
    start_urls = [
        'https://example.com/Python/?k=books&p=1',
        'https://example.com/Python/?k=books&p=2',
        'https://example.com/Python/?k=books&p=3',
        'https://example.com/Python/?k=tutorials&p=1',
        'https://example.com/Python/?k=tutorials&p=2',
        'https://example.com/Python/?k=tutorials&p=3',
    ]

    def parse(self, response):
        response.selector.remove_namespaces()

        info1 = response.css("scrapedinfo1").extract()
        info2 = response.css("scrapedinfo2").extract()

        for item in zip(info1, info2):
            scraped_info = {
                'scrapedinfo1': item[0],
                'scrapedinfo2': item[1],
            }
            yield scraped_info
How can I improve this?
I'd like to search within a certain set of categories and pages.
I need something like
categories = [books, tutorials, a, b, c, d, e, f]
in a range(1, 3)
so that Scrapy would be able to do its job through all categories and pages, while being easy to edit and adapt to other websites.
Any ideas are welcome.
What I have tried:
categories = ["books", "tutorials"]
base = "https://example.com/Python/?k={category}&p={index}"
def url_generator():
for category, index in itertools.product(categories, range(1, 4)):
yield base.format(category=category, index=index)
But Scrapy returns
[scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
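Nothing is crawled because the generator above is never handed to Scrapy: defining url_generator() on its own does not populate start_urls or start_requests(). One way to wire it in (a minimal sketch, not the only option) is to expand the generated URLs directly into start_urls:

import itertools
import scrapy

categories = ["books", "tutorials"]
base = "https://example.com/Python/?k={category}&p={index}"

class SomeSpider(scrapy.Spider):
    name = 'some'
    allowed_domains = ['example.com']
    # build the full URL list up front instead of defining a generator that is never called
    start_urls = [
        base.format(category=category, index=index)
        for category, index in itertools.product(categories, range(1, 4))
    ]

    def parse(self, response):
        # same extraction logic as in the spider above
        ...

Generating the requests lazily is cleaner, though, which is what the approach below does.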

Solved thanks to start_requests() and yield scrapy.Request()
Here's the code
import scrapy
import itertools

class SomeSpider(scrapy.Spider):
    name = 'somespider'
    allowed_domains = ['example.com']

    def start_requests(self):
        categories = ["books", "tutorials"]
        base = "https://example.com/Python/?k={category}&p={index}"
        for category, index in itertools.product(categories, range(1, 4)):
            yield scrapy.Request(base.format(category=category, index=index))

    def parse(self, response):
        response.selector.remove_namespaces()

        info1 = response.css("scrapedinfo1").extract()
        info2 = response.css("scrapedinfo2").extract()

        for item in zip(info1, info2):
            scraped_info = {
                'scrapedinfo1': item[0],
                'scrapedinfo2': item[1],
            }
            yield scraped_info
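Once the placeholder selectors are replaced with real ones, the spider can be run from inside a Scrapy project and its items exported with the built-in feed exporter, for example:

scrapy crawl somespider -o results.csv

Both the category list and the page range live in start_requests(), so adapting the spider to another site only means editing those two lines and the base URL.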

You can use the method start_requests() to generate URLs at start, using yield Request(url).
BTW: later, in parse(), you can also use yield Request(url) to add new URLs, e.g. to follow pagination (see the sketch below).
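For example, a minimal pagination sketch against quotes.toscrape.com (the spider name and the selectors here are my assumptions, not part of the answer's code below):

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes_next'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # yield one item per quote on the current page
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').get()}

        # queue the next page, if any; response.follow() resolves the relative URL
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)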
I use the portal toscrape.com, which was created for testing spiders:
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['quotes.toscrape.com']
    #start_urls = []

    tags = ['love', 'inspirational', 'life', 'humor', 'books', 'reading']
    pages = 3
    url_template = 'http://quotes.toscrape.com/tag/{}/page/{}'

    def start_requests(self):
        for tag in self.tags:
            for page in range(1, self.pages + 1):
                url = self.url_template.format(tag, page)
                yield scrapy.Request(url)

    def parse(self, response):
        # test if method was executed
        print('url:', response.url)

# --- run it without project ---

from scrapy.crawler import CrawlerProcess

#c = CrawlerProcess({
#    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
#    'FEED_FORMAT': 'csv',
#    'FEED_URI': 'output.csv',
#})

c = CrawlerProcess()
c.crawl(MySpider)
c.start()
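Because the script creates its own CrawlerProcess, it can be saved as a plain Python file (say myspider.py; the filename is just an assumption) and run without a Scrapy project:

python myspider.py

Uncommenting the settings block would additionally set a custom User-Agent and write the scraped items to output.csv.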

Related

Why is my Scrapy Response.follow not following the links and returning empty results?

Below you can see my spider (in this case it is just a test). As a test, I first ran parse_remedios as the main parse method and it returned results. But when I use it as the second function (parse_remedios, as in the code below), the results are empty. I know it has something to do with response.follow not running well. Any ideas?
import scrapy

class RemediosSpider(scrapy.Spider):
    name = 'remedios'
    allowed_domains = ['www.drogariaspachecopacheco.com.br']
    start_urls = ['https://www.drogariaspacheco.com.br/clorana%2025mg']

    def parse(self, response):
        print(response.url)
        for link in response.css('.collection-link::attr(href)'):
            yield response.follow(link.get(), callback=self.parse_remedios)

    def parse_remedios(self, response):
        resultado = response.css('.container-fluid')
        yield {
            'nome': resultado.css('.productName::text').get(),
            'preco': resultado.css('.skuBestPrice::text').get(),
            'link': response.url,
            'sku': resultado.css('.skuReference::text').get()
        }
The problem is with your allowed_domains. Scrapy is filtering out all of the links after the start_urls because they do not match any of the domains in your allowed_domains list. I have corrected it in the example below.
You can see it in the output logs:
2022-09-15 21:38:24 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.drogariaspacheco.com.br': <GET https://www.drogariaspacheco.com.br/clorana-25mg-sanofi-aventis-30-comprimidos/p>
import scrapy

class RemediosSpider(scrapy.Spider):
    name = 'remedios'
    allowed_domains = ['drogariaspacheco.com.br']
    start_urls = ['https://www.drogariaspacheco.com.br/clorana%2025mg']

    def parse(self, response):
        for link in response.css('.collection-link::attr(href)'):
            yield response.follow(link.get(), callback=self.parse_remedios)

    def parse_remedios(self, response):
        resultado = response.css('.container-fluid')
        yield {
            'nome': resultado.css('.productName::text').get(),
            'preco': resultado.css('.skuBestPrice::text').get(),
            'link': response.url,
            'sku': resultado.css('.skuReference::text').get()
        }
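As an aside (a sketch of an alternative, not something the corrected spider above needs): if the allowed_domains list could not be changed for some reason, individual requests can opt out of the offsite filter instead, since response.follow() passes extra keyword arguments straight to the Request:

    def parse(self, response):
        for link in response.css('.collection-link::attr(href)'):
            # dont_filter=True bypasses the duplicate and offsite filters for this request only
            yield response.follow(link.get(), callback=self.parse_remedios, dont_filter=True)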

Scrapy-crawled-200 Referer-None

I'm trying to learn how to use Scrapy and Python, but I'm not an expert at all...
I have an empty file after crawling this page:
so.news.cn
and I don't understand why...
Here is my code:
import scrapy

class XinhuaSpider(scrapy.Spider):
    name = 'xinhua'
    allowed_domains = ['xinhuanet.com']
    start_urls = ['http://so.news.cn/?keyWordAll=&keyWordOne=%E6%96%B0%E5%86%A0+%E8%82%BA%E7%82%8E+%E6%AD%A6%E6%B1%89+%E7%97%85%E6%AF%92&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn#search/0/%E6%96%B0%E5%86%A0/1/']

    def parse(self, response):
        #titles = response.css('#newsCon > div.newsList > div.news > h2 > a::text').extract()
        #date = response.css('#newsCon > div.newsList > div.news> div > p.newstime > span::text').extract()
        titles = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='resultList']/div[@class='newsListCnt secondlist']/div[@id='newsCon']/div[@class='newsList']/div[@class='news']/h2/a/text()").extract()
        date = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='resultList']/div[@class='newsListCnt secondlist']/div[@id='newsCon']/div[@class='newsList']/div[@class='news']/div[@class='easynews']/p[@class='newstime']/span/text()").extract()

        for item in zip(titles, date):
            scraped_info = {
                "title": item[0],
                "date": item[1],
            }
            yield scraped_info

        nextPg = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='pagination']/a[@class='next']/@href").extract()
        if nextPg is not None:
            print(nextPg)
This is the message in the console:
2020-05-11 00:09:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://so.news.cn/?keyWordAll=&keyWordOne=%E6%96%B0%E5%86%A0+%E8%82%BA%E7%82%8E+%E6%AD%A6%E6%B1%89+%E7%97%85%E6%AF%92&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn#search/0/%E6%96%B0%E5%86%A0/1/> (referer: None)
[]
You should always check the page's source code (Ctrl+U) in your browser. The content you see in your browser may be loaded by an XHR JavaScript call. Here is code that works for me (I found the correct start URL using the Chrome Developer Console):
import scrapy
import json
import re

class XinhuaSpider(scrapy.Spider):
    name = 'xinhua'
    # allowed_domains = ['xinhuanet.com']
    start_urls = ['http://so.news.cn/getNews?keyWordAll=&keyWordOne=%25E6%2596%25B0%25E5%2586%25A0%2B%25E8%2582%25BA%25E7%2582%258E%2B%25E6%25AD%25A6%25E6%25B1%2589%2B%25E7%2597%2585%25E6%25AF%2592&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn&keyword=%E6%96%B0%E5%86%A0&curPage=1']

    def parse(self, response):
        data = json.loads(response.body)

        for item in data["content"]["results"]:
            scraped_info = {
                "title": item['title'],
                "date": item['pubtime'],
            }
            yield scraped_info

        current_page = data['content']['curPage']
        total_pages = data['content']['pageCount']

        if current_page < total_pages:
            next_page = re.sub(r'curPage=\d+', f"curPage={current_page + 1}", response.url)
            yield scrapy.Request(
                url=next_page,
                callback=self.parse,
            )
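To find this kind of endpoint yourself, the Network tab of the browser's developer tools (filtered on XHR) shows the request the page makes in the background. The JSON URL can then be checked interactively with Scrapy's shell before writing the spider, roughly like this (substitute the getNews URL used as start_urls above):

scrapy shell "<the getNews URL above>"
>>> import json
>>> data = json.loads(response.body)
>>> data['content'].keys()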

Cannot extract JavaScript using Scrapy from a webpage

I am working on extracting data from indeed.com for a data science project. Though I am able to successfully scrape various portions of the page, I am having some issues scraping items from within the JSON portion of the page.
Does anyone know how I would extract the items below from this URL? view-source:https://www.indeed.com/viewjob?jk=41abec7fde3513dc&tk=1dn0mslbr352v000&from=serp&vjs=3&advn=9434814581076032&adid=197003786&sjdu=BbcXv7z69Xez4bal0Fx7iYB6jxzlBG3p6CfmfgjyGDErM4mqXgOsfEsOF5maJ2GRnKJsHskFl8aEbb4LlD5LibXOuIs0dzzHfVCmKB00C2c43rDVhEZX_8Zmg4zqEyqG5LEfQjRfoyOhULxXHTMitWOUjMOdLRt367-ZewSzfkqUSnPzHungl7uY7NcfOFLy
Items to be extracted: \nPOT-Creation-Date: \nPO-Revision-Date: "jobLocation":"Arlington, TX"
A sample script I am running is below.
import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
import boto3

class JobsSpider1(scrapy.Spider):
    name = "indeed"
    allowed_domains = ["indeed.com"]
    start_urls = ["https://www.indeed.com/jobs?q=\"owner+operator\"+\"truck\"&l=augusta"]

    custom_settings = {
        'FEED_FORMAT': 'json',
        'FEED_URI': 'me_test.json'
    }

    def parse(self, response):
        jobs = response.xpath('//div[@class="title"]')
        for job in jobs:
            title = job.xpath('a//@title').extract_first()
            posting_link = job.xpath('a//@href').extract_first()
            posting_url = "https://indeed.com" + posting_link
            yield Request(posting_url, callback=self.parse_page, meta={'title': title, 'posting_url': posting_url})

        relative_next_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        absolute_next_url = "https://indeed.com" + relative_next_url
        yield Request(absolute_next_url, callback=self.parse)

    def parse_page(self, response):
        posting_url = response.meta.get('posting_url')
        job_title = response.meta.get('title')

        #job_name = response.xpath('//*[@class="icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"]/text()').extract_first()
        job_descriptions = response.xpath('//*[@class="jobsearch-jobDescriptionText"]/ul').extract_first()
        job_listing_header = response.xpath('//*[@class="jobSectionHeader"]/ul').extract_first()
        posted_on_date = response.xpath('//*[@class="jobsearch-JobMetadataFooter"]/text()').extract_first()
        job_location = response.xpath('//*[@class="jobsearch-InlineCompanyRating icl-u-xs-mt--xs jobsearch-DesktopStickyContainer-companyrating"]/div[3]/text()').extract_first()

        yield {
            'job_title': job_title,
            'posting_url': posting_url,
            # 'job_name': job_name,
            'job_listing_header': job_listing_header,
            'job_location': job_location,
            'job_descriptions': job_descriptions,
            'posted_on_date': posted_on_date
        }
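One common way to get at values that only live inside a <script> block is to pull the raw JavaScript text out with a selector and then either cut individual values out with a regular expression or isolate the embedded JSON object and parse it with json.loads. A minimal sketch (the jobLocation key comes from the question above; the script-locating heuristic and the helper name are assumptions, not indeed.com's actual page layout):

import re

def extract_job_location(response):
    # grab the first <script> block that mentions the key we are after (assumed heuristic)
    script = response.xpath('//script[contains(text(), "jobLocation")]/text()').get()
    if not script:
        return None
    # cut the single value out with a regex; alternatively, isolate the whole JSON
    # object and run json.loads() on it to get every field at once
    match = re.search(r'"jobLocation"\s*:\s*"([^"]+)"', script)
    return match.group(1) if match else None

Inside parse_page() above, this could be called as job_location = extract_job_location(response) and the result added to the yielded dict.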

INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)

I just began to learn Python and Scrapy.
My first project is to crawl information on a website containing web security information. But when I run it using cmd, it says "Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)" and nothing seems to come out. I'd be grateful if someone could solve my problem.
My code:
import scrapy

class SapoSpider(scrapy.Spider):
    name = "imo"
    allowed_domains = ["imovirtual.com"]
    start_urls = ["https://www.imovirtual.com/arrendar/apartamento/lisboa/"]

    def parse(self, response):
        subpage_links = []
        for i in response.css('div.offer-item-details'):
            youritem = {
                'preco': i.css('span.offer-item title::text').extract_first(),
                'autor': i.css('li.offer-item-price::text').extract(),
                'data': i.css('li.offer-item-area::text').extract(),
                'data_2': i.css('li.offer-item-price-perm::text').extract()
            }
            subpage_link = i.css('header[class=offer-item-header] a::attr(href)').extract()
            subpage_links.extend(subpage_link)

            for subpage_link in subpage_links:
                yield scrapy.Request(subpage_link, callback=self.parse_subpage, meta={'item': youritem})

    def parse_subpage(self, response):
        for j in response.css('header[class=offer-item-header] a::attr(href)'):
            youritem = response.meta.get('item')
            youritem['info'] = j.css(' ul.dotted-list, li.h4::text').extract()
            yield youritem
There are two things to correct to make it work:
You need to define the FEED_URI setting with the path where you want to store the result.
You need to use response in parse_subpage, because the logic is the following: Scrapy downloads "https://www.imovirtual.com/arrendar/apartamento/lisboa/" and gives the response to parse; you extract the ad URLs and ask Scrapy to download each page and pass the downloaded pages to parse_subpage. So response in parse_subpage corresponds to, for example, https://www.imovirtual.com/anuncio/t0-totalmente-remodelado-localizacao-excelente-IDGBAY.html#913474cdaa
This should work:
import scrapy

class SapoSpider(scrapy.Spider):
    name = "imo"
    allowed_domains = ["imovirtual.com"]
    start_urls = ["https://www.imovirtual.com/arrendar/apartamento/lisboa/"]

    custom_settings = {
        'FEED_URI': './output.json'
    }

    def parse(self, response):
        subpage_links = []
        for i in response.css('div.offer-item-details'):
            youritem = {
                'preco': i.css('span.offer-item title::text').extract_first(),
                'autor': i.css('li.offer-item-price::text').extract(),
                'data': i.css('li.offer-item-area::text').extract(),
                'data_2': i.css('li.offer-item-price-perm::text').extract()
            }
            subpage_link = i.css('header[class=offer-item-header] a::attr(href)').extract()
            subpage_links.extend(subpage_link)

            for subpage_link in subpage_links:
                yield scrapy.Request(subpage_link, callback=self.parse_subpage, meta={'item': youritem})

    def parse_subpage(self, response):
        youritem = response.meta.get('item')
        youritem['info'] = response.css(' ul.dotted-list, li.h4::text').extract()
        yield youritem
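To actually get the file, run the spider from inside a Scrapy project with scrapy crawl imo; with the custom_settings above, the scraped items end up in ./output.json. An equivalent one-off alternative is to drop the custom_settings and pass the output path on the command line:

scrapy crawl imo -o output.json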

Scrapy delay request

Every time I run my code my IP gets banned. I need help to delay each request for 10 seconds. I've tried to place DOWNLOAD_DELAY in the code but it gives no results. Any help is appreciated.
import re

import scrapy

# item class included here
class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "https://washingtondc.craigslist.org/search/fua"
    ]

    BASE_URL = 'https://washingtondc.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/nos/vgm/" + item_id
            item = DmozItem()
            item["link"] = response.url
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
You need to set DOWNLOAD_DELAY in settings.py of your project. Note that you may also need to limit concurrency. By default concurrency is 8, so you are hitting the website with 8 simultaneous requests.
# settings.py
DOWNLOAD_DELAY = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 2
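For the 10-second delay asked about in the question, the same setting just takes a larger value. Note that by default Scrapy randomizes the actual wait to between 0.5 and 1.5 times DOWNLOAD_DELAY; that can be switched off if an exact interval is needed (a sketch, adjust to taste):

# settings.py
DOWNLOAD_DELAY = 10
RANDOMIZE_DOWNLOAD_DELAY = False  # wait exactly 10 s instead of 5-15 s
CONCURRENT_REQUESTS_PER_DOMAIN = 1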
Starting with Scrapy 1.0 you can also place custom settings in the spider, so you could do something like this:
class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    custom_settings = {
        "DOWNLOAD_DELAY": 5,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 2
    }
Delay and concurrency are set per downloader slot, not per request. To check what delay your spider actually uses, you could try something like this:
def parse(self, response):
    """
    """
    delay = self.crawler.engine.downloader.slots["www.dmoz.org"].delay
    concurrency = self.crawler.engine.downloader.slots["www.dmoz.org"].concurrency
    self.log("Delay {}, concurrency {} for request {}".format(delay, concurrency, response.request))
    return
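If a fixed delay is too blunt an instrument, Scrapy also ships an AutoThrottle extension that adapts the delay to the server's response times; a minimal settings.py sketch (the values are illustrative):

# settings.py
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5           # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 60            # upper bound when the server is slow
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # average parallel requests per remote server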
