Web Scraping all Urls from a website with Scrapy and Python

Web Scraping all Urls from a website with Scrapy and Python - python

I am writing a web scraper to fetch a group of links
(located at tree.xpath('//div[#class="work_area_content"]/a/#href')
from a website and return the Title and Url of all the leafs sectioned by the leafs parent. I have two scrapers: one in python and one in Scrapy for Python. What is the purpose of callbacks in the Scrapy Request method? Should the information be in a multidimensional or single dimension list ( I believe multi-dimensional but it enhances complication)? Which of the below code is better? If the scraper code is better, how do I migrate the python code to the Scrapy code?
From what I understand from callbacks is that it passes a function's arguments to another function; however, if the callback refers to itself, the data gets overwritten and therefore lost, and you're unable to go back to the root data. Is this correct?
python:
url_storage = [ [ [ [] ] ] ]
page = requests.get('http://1.1.1.1:1234/TestSuites')
tree = html.fromstring(page.content)
urls = tree.xpath('//div[#class="work_area_content"]/a/#href').extract()
i = 0
j = 0
k = 0
for i, url in enumerate(urls):
absolute_url = "".join(['http://1.1.1.1:1234/', url])
url_storage[i][j][k].append(absolute_url)
print(url_storage)
#url_storage.insert(i, absolute_url)
page = requests.get(url_storage[i][j][k])
tree2 = html.fromstring(page.content)
urls2 = tree2.xpath('//div[#class="work_area_content"]/a/#href').extract()
for j, url2 in enumerate(urls2):
absolute_url = "".join(['http://1.1.1.1:1234/', url2])
url_storage[i][j][k].append(absolute_url)
page = requests.get(url_storage[i][j][k])
tree3 = html.fromstring(page.content)
urls3 = tree3.xpath('//div[#class="work_area_content"]/a/#href').extract()
for k, url3 in enumerate(urls3):
absolute_url = "".join(['http://1.1.1.1:1234/', url3])
url_storage[i][j][k].append(absolute_url)
page = requests.get(url_storage[i][j][k])
tree4 = html.fromstring(page.content)
urls3 = tree4.xpath('//div[#class="work_area_content"]/a/#href').extract()
title = tree4.xpath('//span[#class="page_title"]/text()').extract()
yield Request(url_storage[i][j][k], callback=self.end_page_parse_TS, meta={"Title": title, "URL": urls3 })
#yield Request(absolute_url, callback=self.end_page_parse_TC, meta={"Title": title, "URL": urls3 })
def end_page_parse_TS(self, response):
print(response.body)
url = response.meta.get('URL')
title = response.meta.get('Title')
yield{'URL': url, 'Title': title}
def end_page_parse_TC(self, response):
url = response.meta.get('URL')
title = response.meta.get('Title')
description = response.meta.get('Description')
description = response.xpath('//table[#class="wiki_table]/tbody[contains(/td/text(), "description")/parent').extract()
yield{'URL': url, 'Title': title, 'Description':description}
Scrapy:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractor import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from datablogger_scraper.items import DatabloggerScraperItem
class DatabloggerSpider(CrawlSpider):
# The name of the spider
name = "datablogger"
# The domains that are allowed (links to other domains are skipped)
allowed_domains = ['http://1.1.1.1:1234/']
# The URLs to start with
start_urls = ['http://1.1.1.1:1234/TestSuites']
# This spider has one rule: extract all (unique and canonicalized) links, follow them and parse them using the parse_items method
rules = [
Rule(
LinkExtractor(
canonicalize=True,
unique=True
),
follow=True,
callback="parse_items"
)
]
# Method which starts the requests by visiting all URLs specified in start_urls
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(url, callback=self.parse, dont_filter=True)
# Method for parsing items
def parse_items(self, response):
# The list of items that are found on the particular page
items = []
# Only extract canonicalized and unique links (with respect to the current page)
links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
# Now go through all the found links
item = DatabloggerScraperItem()
item['url_from'] = response.url
for link in links:
item['url_to'] = link.url
items.append(item)
# Return all the found items
return items

Related

Extracting next page and setting a break

I'm trying to extract webpage data and wished to take the next few pages also but up to a limit, which I can alter. However, I've tested to see if I can at least extract the next few web-pages using Scrapy (As I'm trying to figure this out in Scrapy to learn it), but It only returns the items within the first page.
How do I extract the next pages while setting a limit i.e. 5 pages
For example, here's what I have tried:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
class StatisticsItem(scrapy.Item):
ebay_div = Field(output_processor=TakeFirst())
url = Field(output_processor=TakeFirst())
class StatisticsSpider(scrapy.Spider):
name = 'ebay'
start_urls = ['https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?rt=nc&LH_BIN=1' +
'&LH_PrefLoc=2&mag=1&_sop=16']
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url
)
def parse(self, response):
all_cards = response.xpath('//div[#class="s-item__wrapper clearfix"]')
for card in all_cards:
name = card.xpath('.//h3/text()').get() #get name of product
price = card.xpath('.//span[#class="s-item__price"]//text()').get() #price
product_url = card.xpath('.//a[#class="s-item__link"]//#href').get() #link to product
# now do whatever you want, append to dictionary, yield as item...
summary_data = {
"Name": name,
"Price": price,
"URL": product_url
}
data = {'summary_data': summary_data}
yield scrapy.Request(product_url, meta=data, callback=self.parse_product_details)
# get the next page
next_page_url = card.xpath('.//a[#class="pagination__next icon-link"]/#href').extract_first()
# The last page do not have a valid url and ends with '#'
if next_page_url == None or str(next_page_url).endswith("#"):
self.log("eBay products collected successfully !!!")
else:
print('\n' + '-' * 30)
print('Next page: {}'.format(next_page_url))
yield scrapy.Request(next_page_url, callback=self.parse)
def parse_product_details(self, response):
# Get the summary data
data = response.meta['summary_data']
data['location'] = response.xpath('//span[#itemprop="availableAtOrFrom"]/text()').extract_first()
yield data
process = CrawlerProcess(
settings={
'FEED_URI': 'collectible_cards.json',
'FEED_FORMAT': 'jsonlines'
}
)
process.crawl(StatisticsSpider)
process.start()

You can try like this first make urls then start start_requests
start_urls = ["https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?LH_BIN=1&LH_PrefLoc=2&mag=1&rt=nc&_pgn={}&_sop=16".format(i) for i in range(1,5)]

Navigate to new page in Scrapy with the same URL

I am writing a scrapy spider to scrape Rightmove, a property website. The issue I'm having is that the property search, which consists of several pages of different house listings, is all located under the same URL.
This means that the usual process of identifying the URL of the 'next' page doesn't work. Is there any way, using scrapy and not selenium (not efficient enough for the purpose) that I can navigate through the different pages? Please see my code and the source code of the relevant 'next page' button as the IMG below.
Thanks.
class listingsSpider(scrapy.Spider):
name = 'listings'
start_urls = ['https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&furnishTypes=&keywords=']
def parse(self, response):
self.logger.info('This my first spider')
address = response.xpath('//*[#id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
listings = response.xpath('//h2[#class="propertyCard-title"]')
for listing in listings:
yield{
'Listing': listing.get()
}
nextPage = response.xpath('//*[#id="l-container"]/div[3]/div/div/div/div[3]/button/div/svg/use')
nextPage = nextPage.get()
pageTest = response.css('div[class=pagination-button pagination-direction pagination-direction--next] svg a::attr(href)')
pageTest = pageTest.get()
if pageTest is not None:
pageTest = response.urljoin(pageTest)
yield scrapy.Request(pageTest,callback=self.parse)
```[![enter image description here][1]][1]
[1]: https://i.stack.imgur.com/1I1J1.png

Actually, it turns out that each page has a unique identifier in the web-link. For example, attach &index = 24, this sends you to the next page.
What you need to figure out is how to include that into the request url. Some may have several pages so we increment by +24 each time to go onto the next page. However, we could increment by +24 onto infinite, therefore we use the number of page results as a way to break. It's rather sneaky to notice at first sight! but pretty easy to overcome.
Here's a scraper that can go to these next pages as requested:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
import requests
from bs4 import BeautifulSoup
links= []
for i in range(0, 480, 24):
url = f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&index={i}&furnishTypes=&keywords='
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
ps1 = soup.find_all('span', {'class':'searchHeader-resultCount'})
for ps in ps1:
if int(ps.text.strip()) > i:
links.append(url)
else:
break
class ListingsItem(scrapy.Item):
address = Field(output_processor = TakeFirst())
listings = Field(output_processor = TakeFirst())
class listingsSpider(scrapy.Spider):
name = 'listings'
start_urls = links
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
callback = self.parse
)
def parse(self, response):
container = response.xpath('//div[#class="l-searchResults"]/div')
for sales in container:
l = ItemLoader(ListingsItem(), selector = sales)
l.add_xpath('address', '//address[#class="propertyCard-address"]/meta[#content]')
l.add_xpath('listings', '//h2[#class="propertyCard-title"]//text()[normalize-space()]')
yield l.load_item()
#self.logger.info('This my first spider')
#address = response.xpath('//*[#id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
#listings = response.xpath('//h2[#class="propertyCard-title"]')
#for listing in listings:
# yield{
# 'Listing': listing.get()
# }
process = CrawlerProcess(
settings = {
'FEED_URI': 'rightmove.jl',
'FEED_FORMAT': 'jsonlines'
}
)
process.crawl(listingsSpider)
process.start()

Python Scrapy, How to get second image on the page with scrapy?

I only want to extract exact one image on every page that scrapy looking for. For example I want to extract http://eshop.erhanteknik.com.tr/photo/foto_w720_604e44853371a920a52b0a31a3548b8b.jpg from http://eshop.erhanteknik.com.tr/tos_svitavy/tos_svitavy/uc_ayakli_aynalar_t0803?DS7641935 page which scrapy looks first. With this code I am currently get whole images with .getall command but I cannot figure how can get specific image.
from scrapy.http import Request
class BooksSpider(Spider):
name = 'books'
allowed_domains = ['eshop.erhanteknik.com.tr']
start_urls = ['http://eshop.erhanteknik.com.tr/urunlerimiz?categoryId=1']
def parse(self, response):
books = response.xpath('//h3/a/#href').extract()
for book in books:
absolute_url = response.urljoin(book)
yield Request(absolute_url, callback=self.parse_book)
# process next page
next_page_url = response.xpath('//a[#rel="next"]/#href').extract_first()
absolute_next_page_url = response.urljoin(next_page_url)
yield Request(absolute_next_page_url)
def parse_book(self, response):
title = response.css('h1::text').extract_first()
image_url = response.xpath('//img/#src').getall()
yield {
'title': title,
'image_url': image_url,
}
pass

You need to target the src of the images under the slide class.
image_url = response.css('.slide img::attr(src)').extract_first()
extract_first() will grab the first item of the list.
If you use extract(), you will get a list.

How to do multiple page scraping using Scrapy?

#----\
#-----*-----\
#----/ \
\
#----\ \
#-----*-------- * <-- START
#----/ /
/
#----\ /
#-----*-----/
#----/
Here is a structure of a website I want to scrap with scrapy, where * is a page and --- indicates link. I want to scrape data of # pages.
I have already done a scraper which can scrape data from a single # page.
import scrapy
class MyItem(scrapy.Item):
topic = scrapy.Field()
symptoms = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "medical"
allowed_domains = ['medlineplus.gov']
start_urls = ['https://medlineplus.gov/ency/article/000178.htm']
def parse(self, response):
item = MyItem()
item["topic"] = response.css('h1.with-also::text').extract_first()
item["symptoms"] = response.css("article div#section-2 li::text").extract()
yield item
The starting webpage is https://medlineplus.gov/encyclopedia.html
I want to scrape info about all diseases in the encyclopedia.

You would need to start with the "encyclopedia.html" page, follow the "alpha" links (the A-Z articles links), then, for every followed page, follow the links to the articles.
You can do this with a CrawlSpider and the Link Extractors, but, since the crawling depth is small, we can do this with a regular Spider:
from urlparse import urljoin # Python 2 only
import scrapy
from scrapy.http import Request
class MyItem(scrapy.Item):
topic = scrapy.Field()
symptoms = scrapy.Field()
class MedicalSpider(scrapy.Spider):
name = "medical"
allowed_domains = ['medlineplus.gov']
start_urls = ['https://medlineplus.gov/encyclopedia.html']
def parse(self, response):
for link in response.css("ul.alpha-links li a::attr(href)").extract():
yield Request(urljoin(response.url, link), callback=self.parse_alpha_page)
def parse_alpha_page(self, response):
for link in response.css("ul#index li a::attr(href)").extract():
yield Request(urljoin(response.url, link), callback=self.parse_page)
def parse_page(self, response):
item = MyItem()
item["topic"] = response.css('h1.with-also::text').extract_first()
item["symptoms"] = response.css("article div#section-2 li::text").extract()
yield item
Note that it looks like there is a better way to get the desired data from the MedlinePlus (check out the "For Developers" page).

Combine FormRequest and CrawlSpider

I need to apply FormRequest [From here][1]:
#Request = FormRequest.from_response(
# response,
# formname='frmSearch',
# formdata={'classtype': 'of'},
# #callback=self.parse_links,
# dont_filter=True,
#
# )
For link in start_urls and to all pages that I get from the rules in my СrawlSpider.
class QuokaSpider(CrawlSpider):
name = 'quoka'
allowed_domains = ['www.quoka.de']
start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen/']
curr_page = 0
rules = (Rule(LinkExtractor(allow=(r'.+'), restrict_xpaths = [u'//li[#class="arr-rgt active"]',]),
follow=True, callback='parse_links'),
)
def _url(self, url):
return 'http://www.quoka.de' + url
def parse_links(self, response):
hxs = Selector(response)
lnks = hxs.xpath('//a[contains(#class, "img-lmtr") and contains(#class, "multi") or contains(#class, "single")]/#href').extract()
filters = hxs.xpath(u'//div[#class="modal-title"]/text()').extract()
for fil in filters:
print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"+fil+"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
for url in lnks:
request = Request(self._url(url), callback=self.parse_object)
yield request
def parse_object(self, response):
item = AnbieterItem()
hxs = Selector(response)
item['Beschreibung'] = hxs.xpath(u'//div[#class="text"]/text()').extract()
# item['Kleinanzeigen_App'] = '1'
# item['Preis'] = '1'
return item
If I try to use "start_request" to the filter, the spider does not use pages from the rules.
How can I solve this problem and apply this filter to start url and urls from rules?

I don't know how to combine CrawlSpider Rules with FormRequest but I'd like to suggest that you replace the CrawlSpider with a generic Spider and create the Requests manually.
The Rule in your code does only take care of following the pagination (as far as i can see). To replace that you could use something like in the following code sample:
import scrapy
class TestSpider(scrapy.Spider):
name = 'quoka'
start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen']
def parse(self, response):
request = scrapy.FormRequest.from_response(
response,
formname='frmSearch',
formdata={'classtype': 'of'},
callback=self.parse_filtered
)
print request.body
yield request
def parse_filtered(self,response):
resultList = response.xpath('//div[#id="ResultListData"]/ul/li')
for resultRow in resultList:
xpath_Result_Details = './/div[#class="q-col n2"]/a'
# Check if row has details
if resultRow.xpath(xpath_Result_Details):
result_Details = resultRow.xpath(xpath_Result_Details)
# If YES extract details
title = result_Details.xpath('./#title').extract()
href = result_Details.xpath('./#href').extract()[0]
# Code to request detail pages goes here ...
print title, href
# Use this instead of CrawlSpider to follow the pagination links
xpath_NextPage = '//div[#class="rslt-pagination"]//li[#class="arr-rgt active"]/a'
if response.xpath(xpath_NextPage):
nextPage_href = response.xpath(xpath_NextPage + '/#href').extract()[0]
nextPage_url = 'http://www.quoka.de/immobilien/bueros-gewerbeflaechen' + nextPage_href
nextPage_num = response.xpath(xpath_NextPage + '/#data-qng-page').extract()[0]
# request = scrapy.Request(nextPage_url, callback=self.parse_filtered)
# Create request with formdata ...
request = scrapy.FormRequest.from_response(
response,
formname='frmNaviSearch',
formdata={'pageno': nextPage_num},
callback=self.parse_filtered
)
yield request

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Web Scraping all Urls from a website with Scrapy and Python - python

Related

Extracting next page and setting a break

Navigate to new page in Scrapy with the same URL

Python Scrapy, How to get second image on the page with scrapy?

How to do multiple page scraping using Scrapy?

Combine FormRequest and CrawlSpider

Categories

Resources