After adding kwarg and second parse method Scrapy stopped working - python

After adding the kwarg, the script stopped outputting any scraped data; it only outputs the normal spider debug messages. I have completely no idea why it does that,
it looks like the whole parseMain callback is just sitting there doing nothing.
Here is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider


class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        websites = response.css('div.root')
        for websitep in websites:
            websiteurl = websitep.css('div.rp-l0pkv6 a::attr(href)').get()
            href = websitep.css('li.rp-np9kb1 a::attr(href)').get()
            url = response.urljoin(href)
            yield Request(url, cb_kwargs={'websiteurl': websiteurl}, callback=self.parseMain)

    def parseMain(self, response, websiteurl):
        # def parse(self, response):
        for quote in response.css('.rp-y89gny.eboilu01 ul li'):
            address = quote.css('address.rp-o9b83y::text').get(),
            name = quote.css('h2.rp-69f2r4::text').get(),
            href = quote.css('li.rp-np9kb1 a::attr(href)').get(),
            PAGETEST = response.css('a.rp-mmikj9::attr(href)').get()
            yield {
                'address': address,
                'name': name,
                'href': href,
                'PAGETEST': PAGETEST,
                'websiteurl': websiteurl
            }
        next_page = response.css('a.rp-mmikj9::attr(href)').get()
        if next_page is not None:
            next_page_link = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_link, callback=self.parse)


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()
Thanks for the help in advance.
EDIT: Oh shoot, I forgot to say what my code is supposed to do.
Basically, parse gets the website URL from inside sub-pages like "https://rynekpierwotny.pl/deweloperzy/dom-development-sa-955/",
while parseMain gets all the data (like address and name) from the main page "https://rynekpierwotny.pl/deweloperzy/?page=1".
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider


class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        for quote in response.css('.rp-y89gny.eboilu01 ul li'):
            yield {
                'address': quote.css('address.rp-o9b83y::text').get(),
                'name': quote.css('h2.rp-69f2r4::text').get(),
                'href': quote.css('li.rp-np9kb1 a::attr(href)').get(),
                'PAGETEST': response.css('a.rp-mmikj9::attr(href)').get()
            }
        next_page = response.css('a.rp-mmikj9::attr(href)').get()
        if next_page is not None:
            next_page_link = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_link, callback=self.parse)


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()
This worked.

Edit:
I made some further adjustments based on your notes about what you want the program to do. It should work the way you expect now.
Try this instead:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider


class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        websites = response.css('div#root')[0]
        PAGETEST = response.xpath('//a[contains(@class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
        for website in websites.css('li.rp-np9kb1'):
            page = website.css('a::attr(href)').get()
            address = website.css('address.rp-o9b83y::text').get()
            name = website.css('h2.rp-69f2r4::text').get()
            params = {
                'address': address,
                'name': name,
                'href': page,
            }
            url = response.urljoin(page)
            yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
        yield Request(url=response.urljoin(PAGETEST), callback=self.parse)

    def parseMain(self, response, params=None):
        # print(response.url)
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        yield params


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()
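
If you also want the yielded items written to a file instead of only watching the crawl log, one option (a minimal sketch, assuming Scrapy 2.1+ where the FEEDS setting is available; the file name is just an example) is to pass feed settings to CrawlerProcess:

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        # Export every yielded item to a JSON file.
        "FEEDS": {"results.json": {"format": "json"}},
    })
    process.crawl(RynekMainSpider)
    process.start()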

Related

Scrapy tracking and scraping third page

After trying to add a third page to this shenanigans I got the error "You can't mix str and non-str arguments". My goal is to use the URL from 'website' and scrape data from it. How do I do that?
Here is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider


class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        websites = response.css('div#root')[0]
        PAGETEST = response.xpath('//a[contains(@class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
        for website in websites.css('li.rp-np9kb1'):
            page = website.css('a::attr(href)').get()
            address = website.css('address.rp-o9b83y::text').get()
            name = website.css('h2.rp-69f2r4::text').get()
            params = {
                'address': address,
                'name': name,
                'href': page,
            }
            url = response.urljoin(page)
            urlem = response.urljoin(website)
            yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
            yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)
        yield Request(url=response.urljoin(PAGETEST), callback=self.parse)

    def parseMain(self, response, params=None):
        # print(response.url)
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        yield params

    def parseEmail(self, response, params=None):
        hps = HtmlXPathSelector(response)
        email = hxs.xpath('//body').re('([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()
Thanks for the help in advance.
A bit of simple debugging points to the error line:
urlem = response.urljoin(website)  # You can't mix str and non-str arguments
website is a Selector, and urljoin needs a string.
Perhaps what you are looking for is this:
urlem = response.urljoin(website.xpath('.//a/@href').get())
OK, I solved it.
I just moved the yield a bit.
You can't yield a request built from a string that doesn't exist yet; the string has to be created first,
and that's why I had problems before.
The website URL is scraped in parseMain, not in parse, so the email request has to be yielded from there.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider


class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        websites = response.css('div#root')[0]
        PAGETEST = response.xpath('//a[contains(@class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
        for website in websites.css('li.rp-np9kb1'):
            page = website.css('a::attr(href)').get()
            address = website.css('address.rp-o9b83y::text').get()
            name = website.css('h2.rp-69f2r4::text').get()
            params = {
                'address': address,
                'name': name,
                'href': page,
            }
            url = response.urljoin(page)
            yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
        yield Request(url=response.urljoin(PAGETEST), callback=self.parse)

    def parseMain(self, response, params=None):
        # print(response.url)
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        urlem = response.urljoin(website)
        yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)

    def parseEmail(self, response, params=None):
        email = response.css('div.m-Footer__company a::attr(href)').get()
        params['email'] = email
        yield params


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()

Scrapy: correct form data but can't log in

I have already managed to log in and start scraping data on some webpages, but I noticed that I have big problems with other pages.
I checked the form data in Google Chrome and there is only a username and a password to fill in, so that is what I did in my code, but I can't log in...
My Code
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import DuifloginItem
from scrapy.http import Request, FormRequest
import csv


class DuifLogin(CrawlSpider):
    name = "duiflogin"
    allowed_domains = ['duif.nl']
    login_page = 'https://www.duif.nl/login'
    custom_settings = {'FEED_EXPORT_FIELDS': ['SKU', 'Title', 'Price', 'Link', 'Title_small', 'NL_PL_PC', 'Description']}
    with open("duifonlylinks.csv", "r") as f:
        reader = csv.DictReader(f)
        start_urls = [items['Link'] for items in reader]
    rules = (
        Rule(
            LinkExtractor(),
            callback='parse_page',
            follow=True
        ),
    )

    def start_requests(self):
        yield Request(
            url=self.login_page,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        return FormRequest.from_response(response, formdata={
            'username': 'not real',
            'password': 'login data',
        }, callback=self.after_loging)

    def after_loging(self, response):
        accview = response.xpath('//div[@class="c-accountbox clearfix js-match-height"]/h3')
        if accview:
            print('success')
        else:
            print(':(')
        for url in self.start_urls:
            yield response.follow(url=url, callback=self.parse_page)

    def parse_page(self, response):
        productpage = response.xpath('//div[@class="product-details col-md-12"]')
        if not productpage:
            print('No product', response.url)
        for a in productpage:
            items = DuifloginItem()
            items['Link'] = response.url
            items['SKU'] = response.xpath('//p[@class="desc"]/text()').get().strip()
            items['Price'] = response.xpath('//span[@class="regular"]/text()').get()
            items['Title'] = response.xpath('//h1[@class="product-title"]/text()').get()
            items['Title_small'] = response.xpath('//div[@class="left"]/p/text()').get()
            items['NL_PL_PC'] = response.xpath('//div[@class="desc"]/ul/li/em/text()').getall()
            items['Description'] = response.xpath('//div[@class="item"]/p/text()').getall()
            yield items
Here you can see the form data on the 302 POST:
Am I missing cookies? If so, I can't find any on the whole domain...
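
One thing worth checking (a hedged suggestion, not a confirmed fix): FormRequest.from_response only submits the fields it finds in the form's HTML, so if the site adds a hidden field such as a CSRF token via JavaScript, it never gets posted. A minimal sketch for logging what Scrapy actually sees on the login page, using the same parse callback as above:

    def parse(self, response):
        # Log every input field Scrapy can see in the login form; if a hidden
        # token shows up in the browser's form data but not here, it is being
        # injected client-side and has to be added to formdata by hand.
        for field in response.xpath('//form//input'):
            self.logger.info('form field: name=%s value=%s',
                             field.xpath('./@name').get(),
                             field.xpath('./@value').get())
        return FormRequest.from_response(response, formdata={
            'username': 'not real',
            'password': 'login data',
        }, callback=self.after_loging)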

How to pass data from one Class to Scrapy Class

I need to pass the URL, username, and password from one class to Scrapy Class to perform web scraping.
import quotes as q
import scrapy
from scrapy.crawler import CrawlerProcess


class ValidateURL:
    def checkURL(self, urls):
        try:
            if urls:
                for key, value in urls.items():
                    if value['login_details']:
                        self.runScrap(value)
        except:
            return False

    def runScrap(self, data):
        if data:
            ''' data = "url_4": {
                    "url": ("https://quotes.toscrape.com/login",),
                    "fields_in_response": ["Quotes to Scrape", "Login"],
                    "login_details": {"name": "foobar", "pwd": "foobar"},
                    "fields_in_main_page": ["Quotes to Scrape", "Top Ten tags"]
                }
            '''
            process = CrawlerProcess()
            process.crawl(q.QuotesSpider, start_urls=data['url'])
            process.start()
And the Scrapy class is:
# -*- coding: utf-8 -*-
from scrapy import Spider
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
import sys
import logging
from bs4 import BeautifulSoup
# import scrapy
# from scrapy.crawler import CrawlerProcess

logging.basicConfig(filename='app.log', level=logging.INFO)


class QuotesSpider(Spider):
    name = 'quotes'
    start_urls = ('https://quotes.toscrape.com/login',)

    def parse(self, response):
        # print(self.req['url'])
        print('/'*100)
        self.start_urls = self.login_url
        # print(type(self.login_url))
        inputs = response.xpath('//form//input').extract()
        soup_dict = {}
        for key, i in enumerate(inputs):
            soup = BeautifulSoup(i, 'html.parser')
            inp_type = soup.input['type'] if soup.input.has_attr('type') else None
            inp_value = soup.input['value'] if soup.input.has_attr('value') else None
            inp_name = soup.input['name'] if soup.input.has_attr('name') else None
            soup_dict[key] = {'name': inp_name, 'value': inp_value, 'type': inp_type}
        token = response.xpath('//*[@name="csrf_token"]/@value').extract_first()
        return FormRequest.from_response(response,
                                         formdata={'csrf_token': token,
                                                   'password': 'foobar',
                                                   'username': 'foobar'},
                                         callback=self.scrape_pages)

    def fetch_form_data(self, response):
        if all(field in response for field in self.fields_in_response):
            inputs = response.xpath('//form//input').extract()
            soup_dict = {}
            for key, i in enumerate(inputs):
                soup = BeautifulSoup(i, 'html.parser')
                inp_type = soup.input['type'] if soup.input.has_attr('type') else None
                inp_value = soup.input['value'] if soup.input.has_attr('value') else None
                inp_name = soup.input['name'] if soup.input.has_attr('name') else None
                soup_dict[key] = {'name': inp_name, 'value': inp_value, 'type': inp_type}

    def scrape_pages(self, response):
        open_in_browser(response)
        # Complete your code here to scrape the pages that you are redirected to after logging in
        # ....
        # ....
However, I'm not able to update the class variable start_urls with the variable passed from the ValidateURL class. I tried __init__ in the QuotesSpider class, but that didn't work. start_urls is actually a class member of the base class (Spider). Could someone please help me understand how to update a class variable of the base class?
Could someone suggest what is missing?
You can pass parameters to the spider from the crawl call like this:
process.crawl(q.QuotesSpider, first='James', last='Bond')
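
Those keyword arguments become attributes on the spider instance (self.first, self.last). A minimal sketch of the spider side, assuming you want to feed it the url and login_details from the data dict shown in ValidateURL.runScrap (the attribute names here are just an example):

from scrapy import Spider

class QuotesSpider(Spider):
    name = 'quotes'

    def __init__(self, start_urls=None, login_details=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Whatever is passed to process.crawl() arrives here as keyword arguments,
        # so the class-level start_urls can be replaced per run.
        if start_urls:
            self.start_urls = list(start_urls)
        self.login_details = login_details or {}

# called as:
# process.crawl(q.QuotesSpider, start_urls=data['url'], login_details=data['login_details'])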

Crawl iframe and page at the same time

I just wanted to know if it's possible to crawl a page on a website and extract data from that page and from an iframe on that page at the same time?
I'm using Scrapy with Python and I already know how to extract data from the iframe...
Thank you for your help!!
Thanks to your answer, I made this... But I don't know what to put instead of 'url'... Can you help me again, please?
# -*- coding: utf-8 -*-
import scrapy
import re
import numbers
from fnac.items import FnacItem
from urllib.request import urlopen
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup


class Fnac(CrawlSpider):  # scrapy.Spider
    name = 'FnacCom'
    allowed_domains = ['fnac.com']
    start_urls = ['http://www.fnac.com/MORMANE/srefA5533119-3387-5EC4-82B6-AA61216BF599']

    ##### To extract links in order to run the spider in them
    # rules = (
    #     Rule(LinkExtractor(allow=()), callback='parse'),
    # )

    def parse(self, response):
        soup = BeautifulSoup(urlopen(response.url), "lxml")
        iframexx = soup.find_all('iframe')
        for iframe in iframexx:
            yield scrapy.Request(iframe.attrs['src'], callback=self.parse2)

    ##### Main function
    def parse1(self, response):
        item1 = FnacItem()
        nb_sales = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()
        country = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()
        yield scrapy.Request(url, meta={'item': item1})  # I don't know what to put instead of URL...

    def parse2(self, response):
        same_item = response.meta['item']
        address = response.xpath('//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')
        email = response.xpath('//div/ul/li[contains(text(),"@")]/text()').extract()
        name = response.xpath('//div/p[@class="customer-policy-label"]/text()').re(r'Infos sur la boutique \: ([a-zA-Z0-9]*)')
        phone = response.xpath('//div/p/text()').re(r'.*Tél \: ([\d]*)\n?.*')
        siret = response.xpath('//div/p/text()').re(r'.*Siret \: ([\d]*)\n?.*')
        vat = response.xpath('//div/text()').re(r'.*TVA \: (.*)')
        if (len(name) != 0):
            item['name'] = ''.join(name).strip()
            item['address'] = ''.join(address).strip()
            item['phone'] = ''.join(phone).strip()
            item['email'] = ''.join(email).strip()
            item['nb_sales'] = ''.join(nb_sales).strip()
            item['country'] = ''.join(country).strip()
            item['vat'] = ''.join(vat).strip()
            item['siret'] = ''.join(siret).strip()
            return item
To combine information from different requests into a single item, you have to use the meta parameter of the requests:
def parse1(self, response):
    item1 = {
        ...
    }
    yield Request(url='another_url.com', meta={'item': item1}, callback=self.parse2)

def parse2(self, response):
    same_item = response.meta['item']
    # keep populating the item with the second response
    ...
    yield same_item
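
On newer Scrapy versions (1.7+) the same hand-off can also be done with cb_kwargs, as in the spiders earlier on this page, which passes the item as a plain callback argument instead of going through response.meta. A minimal sketch mirroring the snippet above (the selectors are placeholders):

def parse1(self, response):
    item1 = {'name': response.css('h1::text').get()}  # placeholder field
    yield Request(url='another_url.com', cb_kwargs={'item': item1}, callback=self.parse2)

def parse2(self, response, item=None):
    item['address'] = response.css('address::text').get()  # placeholder field
    yield item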

scrapy SgmlLinkExtractor scrape Master and Detail pages

I am trying to extract information from listing and detail pages.
The code below correctly scrapes the reviewer information from the listing page and all linked pages (where an anchor contains "Next").
The detail-page URLs are also captured, e.g. http://www.screwfix.com/p/prysmian-6242y-twin-earth-cable-2-5mm-x-100m-grey/20967
However, I cannot see how I can navigate to and scrape the information from the detail pages.
Is there anyone here who has used Scrapy successfully who can help me finish this spider?
Thank you for the help.
I include the code for the spider below:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from hn_scraper.items import HnArticleItem


class ScrewfixSpider(Spider):
    name = "Screwfix"
    allowed_domains = ["www.screwfix.com"]
    start_urls = ('http://www.screwfix.com/', )

    link_extractor = SgmlLinkExtractor(
        allow=('www', ),
        restrict_xpaths=('//a[contains(., "Next")]', ))
    detail_page_extractor = SgmlLinkExtractor(
        allow=('www', ),
        restrict_xpaths=('//tr[@id[contains(., "reviewer")]]/td[3]/a', ))

    def extract_one(self, selector, xpath, default=None):
        extracted = selector.xpath(xpath).extract()
        if extracted:
            return extracted[0]
        return default

    def parse(self, response):
        for link in self.link_extractor.extract_links(response):
            request = Request(url=link.url)
            request.meta.update(link_text=link.text)
            yield request
        for item in self.parse_item(response):
            yield item

    def parse_item(self, response):
        selector = Selector(response)
        rows = selector.xpath('//table[contains(.,"crDataGrid")]//tr[@id[contains(., "reviewer")]]')
        for row in rows:
            item = HnArticleItem()
            reviewer = row.xpath('td[3]/a')
            reviewer_url = self.extract_one(reviewer, './@href', '')
            reviewer_name = self.extract_one(reviewer, 'b/text()', '')
            total_reviews = row.xpath('td[4]/text()').extract()
            item['url'] = reviewer_url
            item['name'] = reviewer_name
            item['total_reviews'] = total_reviews
            yield item
        detail_pages = self.detail_page_extractor.extract_links(response)
        if detail_pages:
            print 'detail_pages'
            print detail_pages[0].url
            yield Request(detail_pages[0].url)
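
This question was left open here, but the usual pattern (a hedged sketch, not from the original thread; parse_detail and its XPath are hypothetical) is to yield one Request per extracted detail link with a dedicated callback, rather than only requesting detail_pages[0] without one:

    def parse_item(self, response):
        # ... yield the reviewer items as above, then follow every detail page:
        for link in self.detail_page_extractor.extract_links(response):
            yield Request(link.url, callback=self.parse_detail)

    def parse_detail(self, response):
        item = HnArticleItem()
        item['url'] = response.url
        # Hypothetical field; the real XPaths depend on the detail page layout.
        name = response.xpath('//h1/text()').extract()
        item['name'] = name[0] if name else ''
        yield item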
