Python Scrapy: scrape data from nested pages

I have made a scraper for a website where the data is nested deep: to get to the data page I have to click through 5 links, and only then do I reach the page I actually scrape. For every page 1 there are multiple page 2s, for every page 2 there are many page 3s, and so on.
So I have a parse function for opening each level of page until I get to the page that has the data, where I fill the item class and return the item.
But it is skipping a lot of links without scraping data: the last parse_link callback stops executing after 100 or so links. How do I know parse_link is not executing? Because I am printing print '\n\n', 'I AM EXECUTED !!!!' and it stops printing after 100 or so links, even though parse_then executes every time.
What I want to know is: am I doing it right? Is this the right approach for scraping a website like this?
Here is the code:
# -*- coding: utf-8 -*-
import scrapy
from urlparse import urljoin

from nothing.items import NothingItem


class Canana411Spider(scrapy.Spider):
    name = "canana411"
    allowed_domains = ["www.canada411.ca"]
    start_urls = ['http://www.canada411.ca/']

    # PAGE 1
    def parse(self, response):
        SET_SELECTOR = '.c411AlphaLinks.c411NoPrint ul li'
        for attr in response.css(SET_SELECTOR):
            linkse = 'a ::attr(href)'
            link = attr.css(linkse).extract_first()
            link = urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse_next)

    # PAGE 2
    def parse_next(self, response):
        SET_SELECTOR = '.clearfix.c411Column.c411Column3 ul li'
        for attr in response.css(SET_SELECTOR):
            linkse = 'a ::attr(href)'
            link = attr.css(linkse).extract_first()
            link = urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse_more)

    # PAGE 3
    def parse_more(self, response):
        SET_SELECTOR = '.clearfix.c411Column.c411Column3 ul li'
        for attr in response.css(SET_SELECTOR):
            linkse = 'a ::attr(href)'
            link = attr.css(linkse).extract_first()
            link = urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse_other)

    # PAGE 4
    def parse_other(self, response):
        SET_SELECTOR = '.clearfix.c411Column.c411Column3 ul li'
        for attr in response.css(SET_SELECTOR):
            linkse = 'a ::attr(href)'
            link = attr.css(linkse).extract_first()
            link = urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse_then)

    # PAGE 5
    def parse_then(self, response):
        SET_SELECTOR = '.c411Cities li h3 a ::attr(href)'
        link = response.css(SET_SELECTOR).extract_first()
        link = urljoin(response.url, link)
        return scrapy.Request(link, callback=self.parse_link)

    # PAGE 6, THE DATA PAGE
    def parse_link(self, response):
        print '\n\n', 'I AM EXECUTED !!!!'
        item = NothingItem()
        namese = '.vcard__name ::text'
        addressse = '.c411Address.vcard__address ::text'
        phse = 'span.vcard__label ::text'
        item['name'] = response.css(namese).extract_first()
        item['address'] = response.css(addressse).extract_first()
        item['phone'] = response.css(phse).extract_first()
        return item
Am I doing it right, or is there a better way that I am missing?

If there's no conflict (e.g. the 1st page cannot contain selectors and links that belong to the 3rd, or links that should only be followed from one particular page, or something alike) I'd recommend flattening the rules you use to extract links. A single parse callback would then be enough.
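A minimal sketch of that idea, assuming the question's selectors can simply be combined and that only the final data page contains a .vcard__name element (both are assumptions to verify against the real markup):
import scrapy
from nothing.items import NothingItem  # same item class as in the question


class Canada411FlatSpider(scrapy.Spider):
    name = 'canana411_flat'
    allowed_domains = ['www.canada411.ca']
    start_urls = ['http://www.canada411.ca/']

    # one combined list of link selectors instead of one callback per level
    LINK_SELECTORS = [
        '.c411AlphaLinks.c411NoPrint ul li a::attr(href)',
        '.clearfix.c411Column.c411Column3 ul li a::attr(href)',
        '.c411Cities li h3 a::attr(href)',
    ]

    def parse(self, response):
        # if this page already holds the data, scrape it and stop here
        name = response.css('.vcard__name ::text').extract_first()
        if name:
            item = NothingItem()
            item['name'] = name
            item['address'] = response.css('.c411Address.vcard__address ::text').extract_first()
            item['phone'] = response.css('span.vcard__label ::text').extract_first()
            yield item
            return
        # otherwise follow whatever links the combined selectors find, back into parse
        for selector in self.LINK_SELECTORS:
            for link in response.css(selector).extract():
                yield scrapy.Request(response.urljoin(link), callback=self.parse)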

Related

Python Scrapy, How to get second image on the page with scrapy?

I only want to extract exactly one image from every page that Scrapy visits. For example, I want to extract http://eshop.erhanteknik.com.tr/photo/foto_w720_604e44853371a920a52b0a31a3548b8b.jpg from the http://eshop.erhanteknik.com.tr/tos_svitavy/tos_svitavy/uc_ayakli_aynalar_t0803?DS7641935 page, which Scrapy looks at first. With this code I currently get all of the images with the .getall() command, but I cannot figure out how to get one specific image.
from scrapy import Spider
from scrapy.http import Request


class BooksSpider(Spider):
    name = 'books'
    allowed_domains = ['eshop.erhanteknik.com.tr']
    start_urls = ['http://eshop.erhanteknik.com.tr/urunlerimiz?categoryId=1']

    def parse(self, response):
        books = response.xpath('//h3/a/@href').extract()
        for book in books:
            absolute_url = response.urljoin(book)
            yield Request(absolute_url, callback=self.parse_book)
        # process next page
        next_page_url = response.xpath('//a[@rel="next"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield Request(absolute_next_page_url)

    def parse_book(self, response):
        title = response.css('h1::text').extract_first()
        image_url = response.xpath('//img/@src').getall()
        yield {
            'title': title,
            'image_url': image_url,
        }
You need to target the src of the images under the slide class.
image_url = response.css('.slide img::attr(src)').extract_first()
extract_first() will grab the first item of the list.
If you use extract(), you will get a list.
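Dropped into the question's spider, the change might look roughly like this (whether .slide actually matches the shop's slider markup is an assumption to verify):
import scrapy


class BooksSpider(scrapy.Spider):
    # same spider as in the question, with only parse_book changed
    name = 'books'
    allowed_domains = ['eshop.erhanteknik.com.tr']
    start_urls = ['http://eshop.erhanteknik.com.tr/urunlerimiz?categoryId=1']

    def parse(self, response):
        for book in response.xpath('//h3/a/@href').extract():
            yield scrapy.Request(response.urljoin(book), callback=self.parse_book)

    def parse_book(self, response):
        title = response.css('h1::text').extract_first()
        # take only the first image inside the slider, not every <img> on the page
        image_url = response.css('.slide img::attr(src)').extract_first()
        yield {
            'title': title,
            'image_url': image_url,
        }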

How to scrape the URL when following links in Scrapy

I am confused about how to scrape the URL itself when following links in Scrapy.
I am crawling this page:
import scrapy
from ..items import SkripsiItem


class SkripsiSpiderSpider(scrapy.Spider):
    name = 'skripsi'
    start_urls = ['https://nasional.sindonews.com/topic/9695/pemilu-2019/']

    def parse(self, response):
        for href in response.css('.lnk-t a::attr(href)'):
            yield response.follow(href, self.parse_author)
        for href in response.css('.newpaging li:nth-child(4) a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_author(self, response):
        items = SkripsiItem()

        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        content = response.xpath(".//div[@class='vidy-embed']/descendant::text()").extract()

        items['title'] = extract_with_css('h1::text'),
        items['author'] = extract_with_css('.author a::text'),
        items['time'] = extract_with_css('time::text'),
        items['imagelink'] = extract_with_css('.article img::attr(src)'),
        items['content'] = ''.join(content),

        yield items
How do I scrape every URL that is visited when following the links selected by .lnk-t a::attr(href) in the code above?
Save items['url'] = response.url in the parse_author function.
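A sketch of the question's spider with that single line added. It assumes SkripsiItem also declares a url field (add url = scrapy.Field() to items.py if it does not), and the stray trailing commas from the question are dropped here because they turn each field into a one-element tuple:
import scrapy
from ..items import SkripsiItem  # assumed to also declare a 'url' field


class SkripsiSpiderSpider(scrapy.Spider):
    name = 'skripsi'
    start_urls = ['https://nasional.sindonews.com/topic/9695/pemilu-2019/']

    def parse(self, response):
        for href in response.css('.lnk-t a::attr(href)'):
            yield response.follow(href, self.parse_author)
        for href in response.css('.newpaging li:nth-child(4) a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_author(self, response):
        items = SkripsiItem()

        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        items['title'] = extract_with_css('h1::text')
        items['author'] = extract_with_css('.author a::text')
        items['time'] = extract_with_css('time::text')
        items['imagelink'] = extract_with_css('.article img::attr(src)')
        items['content'] = ''.join(
            response.xpath(".//div[@class='vidy-embed']/descendant::text()").extract())
        # response.url is the address of the page this callback was called for,
        # i.e. the link that was followed from '.lnk-t a::attr(href)'
        items['url'] = response.url
        yield items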

Web Scraping all Urls from a website with Scrapy and Python

I am writing a web scraper to fetch a group of links
(located at tree.xpath('//div[@class="work_area_content"]/a/@href'))
from a website and return the title and URL of all the leaves, sectioned by each leaf's parent. I have two scrapers: one in plain Python and one in Scrapy for Python. What is the purpose of callbacks in the Scrapy Request method? Should the information be in a multidimensional or a single-dimension list (I believe multidimensional, but that adds complexity)? Which of the code below is better? If the Scrapy code is better, how do I migrate the plain Python code to the Scrapy code?
From what I understand, a callback passes a function's arguments to another function; however, if the callback refers to itself, the data gets overwritten and is therefore lost, and you're unable to go back to the root data. Is this correct?
Python:
url_storage = [ [ [ [] ] ] ]

page = requests.get('http://1.1.1.1:1234/TestSuites')
tree = html.fromstring(page.content)
urls = tree.xpath('//div[@class="work_area_content"]/a/@href').extract()

i = 0
j = 0
k = 0
for i, url in enumerate(urls):
    absolute_url = "".join(['http://1.1.1.1:1234/', url])
    url_storage[i][j][k].append(absolute_url)
    print(url_storage)
    #url_storage.insert(i, absolute_url)
    page = requests.get(url_storage[i][j][k])
    tree2 = html.fromstring(page.content)
    urls2 = tree2.xpath('//div[@class="work_area_content"]/a/@href').extract()
    for j, url2 in enumerate(urls2):
        absolute_url = "".join(['http://1.1.1.1:1234/', url2])
        url_storage[i][j][k].append(absolute_url)
        page = requests.get(url_storage[i][j][k])
        tree3 = html.fromstring(page.content)
        urls3 = tree3.xpath('//div[@class="work_area_content"]/a/@href').extract()
        for k, url3 in enumerate(urls3):
            absolute_url = "".join(['http://1.1.1.1:1234/', url3])
            url_storage[i][j][k].append(absolute_url)
            page = requests.get(url_storage[i][j][k])
            tree4 = html.fromstring(page.content)
            urls3 = tree4.xpath('//div[@class="work_area_content"]/a/@href').extract()
            title = tree4.xpath('//span[@class="page_title"]/text()').extract()
            yield Request(url_storage[i][j][k], callback=self.end_page_parse_TS, meta={"Title": title, "URL": urls3})
            #yield Request(absolute_url, callback=self.end_page_parse_TC, meta={"Title": title, "URL": urls3})

def end_page_parse_TS(self, response):
    print(response.body)
    url = response.meta.get('URL')
    title = response.meta.get('Title')
    yield {'URL': url, 'Title': title}

def end_page_parse_TC(self, response):
    url = response.meta.get('URL')
    title = response.meta.get('Title')
    description = response.meta.get('Description')
    description = response.xpath('//table[@class="wiki_table"]/tbody[contains(/td/text(), "description")]/parent').extract()
    yield {'URL': url, 'Title': title, 'Description': description}
Scrapy:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractor import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from datablogger_scraper.items import DatabloggerScraperItem


class DatabloggerSpider(CrawlSpider):
    # The name of the spider
    name = "datablogger"

    # The domains that are allowed (links to other domains are skipped)
    allowed_domains = ['http://1.1.1.1:1234/']

    # The URLs to start with
    start_urls = ['http://1.1.1.1:1234/TestSuites']

    # This spider has one rule: extract all (unique and canonicalized) links, follow them and parse them using the parse_items method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_items"
        )
    ]

    # Method which starts the requests by visiting all URLs specified in start_urls
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    # Method for parsing items
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
        # Now go through all the found links
        item = DatabloggerScraperItem()
        item['url_from'] = response.url
        for link in links:
            item['url_to'] = link.url
            items.append(item)
        # Return all the found items
        return items

scrapy crawl a set of links that might contains next pages

I want to:
Extract links from a certain page
For each link, get some content from that link, and the content of the 'next pages' of that link
Then export it as a JSON file (not important, as far as I can tell, for my problem)
Currently my spider looks like this:
class mySpider(scrapy.Spider):
    ...
    def parse(self, response):
        for url in someurls:
            yield scrapy.Request(url=url, callback=self.parse_next)

    def parse_next(self, response):
        for selector in someselectors:
            yield { 'contents':...,
                    ...}
        nextPage = obtainNextPage()
        if nextPage:
            yield scrapy.Request(url=next_url, callback=self.parse_next)
The problem is that, for a set of links the spider processes, the spider can only reach the 'next page' of the last link in that set (I verified this through selenium + chromedriver). For example, if I have 10 links (No. 1 to No. 10), my spider only gets the next pages for the No. 10 link. I don't know if the problem is caused by some structural problem in my spider. Below is the full code:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://tieba.baidu.com']
    main_url = 'http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8'
    username = ""
    password = ""

    def __init__(self, username=username, password=password):
        #options = webdriver.ChromeOptions()
        #options.add_argument('headless')
        #options.add_argument('window-size=1200x600')
        self.driver = webdriver.Chrome()  #chrome_options=options)
        self.username = username
        self.password = password

    # checked
    def logIn(self):
        elem = self.driver.find_element_by_css_selector('#com_userbar > ul > li.u_login > div > a')
        elem.click()
        wait = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#TANGRAM__PSP_10__footerULoginBtn')))
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__footerULoginBtn')
        elem.click()
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__userName')
        elem.send_keys(self.username)
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__password')
        elem.send_keys(self.password)
        self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit').click()

    # basic checked
    def parse(self, response):
        self.driver.get(response.url)
        self.logIn()
        # wait for hand input verify code
        time.sleep(15)
        self.driver.get('http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8')
        for url in self.driver.find_elements_by_css_selector('a.j_th_tit')[:2]:
            #new_url = response.urljoin(url)
            new_url = url.get_attribute("href")
            yield scrapy.Request(url=new_url, callback=self.parse_next)

    # checked
    def pageScroll(self, url):
        self.driver.get(url)
        SCROLL_PAUSE_TIME = 0.5
        SCROLL_LENGTH = 1200
        page_height = int(self.driver.execute_script("return document.body.scrollHeight"))
        scrollPosition = 0
        while scrollPosition < page_height:
            scrollPosition = scrollPosition + SCROLL_LENGTH
            self.driver.execute_script("window.scrollTo(0, " + str(scrollPosition) + ");")
            time.sleep(SCROLL_PAUSE_TIME)
        time.sleep(1.2)

    def parse_next(self, response):
        self.log('I visited ' + response.url)
        self.pageScroll(response.url)
        for sel in self.driver.find_elements_by_css_selector('div.l_post.j_l_post.l_post_bright'):
            name = sel.find_element_by_css_selector('.d_name').text
            try:
                content = sel.find_element_by_css_selector('.j_d_post_content').text
            except:
                content = ''
            try:
                reply = sel.find_element_by_css_selector('ul.j_lzl_m_w').text
            except:
                reply = ''
            yield {'name': name, 'content': content, 'reply': reply}

        #follow to next page
        next_sel = self.driver.find_element_by_link_text("下一页")
        next_url_name = next_sel.text
        if next_sel and next_url_name == '下一页':
            next_url = next_sel.get_attribute('href')
            yield scrapy.Request(url=next_url, callback=self.parse_next)
Thanks for your help, and I welcome any suggestions regarding my code above.
This is about scraping content from one page, storing it, and allowing the spider to continue the crawl to scrape and store items on subsequent pages. You should configure your items.py file with the item names and pass the item through each scrapy.Request using meta.
You should check out https://github.com/scrapy/scrapy/issues/1138
To illustrate how this works, it goes something like this...
1. First, we set up the items.py file with all of the items to be scraped across the pages.
#items.py
import scrapy


class ScrapyProjectItem(scrapy.Item):
    page_one_item = scrapy.Field()
    page_two_item = scrapy.Field()
    page_three_item = scrapy.Field()
Then import the items.py item class into your Scrapy spider:
from scrapyproject.items import ScrapyProjectItem
Then, in your scraper, for each page iteration that has content you want, initialize the items.py class and pass the item using 'meta' to the next request.
#spider.py
def parse(self, response):
    # Initializing the item class
    item = ScrapyProjectItem()
    # Itemizing the... item lol
    item['page_one_item'] = response.css("etcetc::").extract()  # set desired attribute
    # Here we pass the items to the next concurrent request
    for url in someurls:  # Theres a million ways to skin a cat, dont know your exact use case.
        yield scrapy.Request(response.urljoin(url),
                             callback=self.parse_next, meta={'item': item})

def parse_next(self, response):
    # We load the meta from the previous request
    item = response.meta['item']
    # We itemize
    item['page_two_item'] = response.css("etcetc::").extract()
    # We pass meta again to next request
    for url in someurls:
        yield scrapy.Request(response.urljoin(url),
                             callback=self.parse_again, meta={'item': item})

def parse_again(self, response):
    # We load the meta from the previous request
    item = response.meta['item']
    # We itemize
    item['page_three_item'] = response.css("etcetc::").extract()
    # We pass meta again to next request
    for url in someurls:
        yield scrapy.Request(response.urljoin(url),
                             callback=self.parse_again, meta={'item': item})
    # At the end of each iteration of the crawl loop we can yield the result
    yield item
As for the problem of the crawler only reaching the last link, I would like to have more info instead of guessing what the problem could be. In your parse_next, you should add a print(response.url) to see whether the pages are being reached at all. I'm sorry if I didn't understand your problem and wasted everyone's time lol.
EDIT
I think I understand your issue better now... You have a list of URLs, and each URL has its own set of URLs, yes?
In your code, the obtainNextPage() might be the issue. In the past, when encountering this type of case, I have had to use some xpath and/or regex magic to properly obtain the next pages. I'm not sure what obtainNextPage is doing, but... have you thought of parsing the content and using a selector to find the next page? For example:
class mySpider(scrapy.Spider):
    ...
    def parse(self, response):
        for url in someurls:
            yield scrapy.Request(url=url, callback=self.parse_next)

    def parse_next(self, response):
        for selector in someselectors:
            yield { 'contents':...,
                    ...}
        #nextPage = obtainNextPage()
        next_page = response.xpath('//path/to/nextbutton/orPage').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse_next)
You should still add that print(response.url) to see whether the URL being requested is being called correctly; it might be a urljoin issue.

How can I make Scrapy process the URLs sequentially

I have this code
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//div[@class="headline_area"]')
    items = []
    for site in sites[:5]:
        item = StackItem()
        log.msg(' LOOP' + str(ivar) + '', level=log.ERROR)
        item['title'] = "yoo ma"
        request = Request("blabla", callback=self.test1)
        request.meta['item'] = item
        page_number = nextlink.split("&")[-1].split("=")[-1]
        if int(page_number) > 500:
            raise CloseSpider('Search Exceeded 500')
        ivar = ivar + 1
        yield request

    mylinks = soup.find_all('a')
    if mylinks:
        nextlink = mylinks[0].get('href')
        page_number = nextlink.split("&")[-3].split("=")[-1]
        request = Request(urljoin(response.url, nextlink), callback=self.parse)
        request.meta['page'] = page_number
        yield request
Now my problem is this: suppose I want to stop at page_number = 5.
Scrapy goes to that page before all the items from page 1, page 2, etc. have been downloaded, and it stops when it first reaches it.
How can I get rid of this problem so that it processes all the links before going to page 5?
Does the link have some regularity across different pages? For example, if the 5th page's link is www.xxxx.net/nForum/#!article/Bet/447540?p=5, you can scrape the link with p=5 directly.
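A minimal sketch of that idea, assuming the page number really is carried in a p= query parameter; the URL pattern and the .headline_area selector are placeholders, not confirmed details from the question:
import scrapy


class PagesSpider(scrapy.Spider):
    name = 'pages'
    # build the page URLs up front instead of discovering them while crawling
    start_urls = ['http://www.example.com/news?p=%d' % page for page in range(1, 6)]

    def parse(self, response):
        # scrape the items of each known page here
        for title in response.css('.headline_area ::text').extract():
            yield {'page': response.url, 'title': title}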
You can use the inline_requests decorator.
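A rough sketch of that approach, using the third-party scrapy-inline-requests package; the URL pattern and selector are placeholders rather than details from the question:
import scrapy
from inline_requests import inline_requests  # pip install scrapy-inline-requests


class SequentialSpider(scrapy.Spider):
    name = 'sequential'
    start_urls = ['http://www.example.com/news?p=1']  # placeholder

    @inline_requests
    def parse(self, response):
        # inside a decorated callback, yielding a Request suspends the generator
        # until that response arrives, so each page is fully processed before
        # the next one is fetched
        for page in range(1, 6):  # pages 1..5, in order
            for title in response.css('.headline_area ::text').extract():
                yield {'page': page, 'title': title}
            if page < 5:
                response = yield scrapy.Request(response.urljoin('?p=%d' % (page + 1)))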
