I need to pass the URL, username, and password from one class to the Scrapy spider class to perform web scraping.
import quotes as q
import scrapy
from scrapy.crawler import CrawlerProcess

class ValidateURL:
    def checkURL(self, urls):
        try:
            if urls:
                for key, value in urls.items():
                    if value['login_details']:
                        self.runScrap(value)
        except:
            return False

    def runScrap(self, data):
        if data:
            ''' data = "url_4": {
                    "url": ("https://quotes.toscrape.com/login",),
                    "fields_in_response": ["Quotes to Scrape", "Login"],
                    "login_details": {"name": "foobar", "pwd": "foobar"},
                    "fields_in_main_page": ["Quotes to Scrape", "Top Ten tags"]
                }
            '''
            process = CrawlerProcess()
            process.crawl(q.QuotesSpider, start_urls=data['url'])
            process.start()
And the Scrapy spider class is:
# -*- coding: utf-8 -*-
from scrapy import Spider
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
import sys
import logging
from bs4 import BeautifulSoup
# import scrapy
# from scrapy.crawler import CrawlerProcess

logging.basicConfig(filename='app.log', level=logging.INFO)

class QuotesSpider(Spider):
    name = 'quotes'
    start_urls = ('https://quotes.toscrape.com/login',)

    def parse(self, response):
        # print(self.req['url'])
        print('/' * 100)
        self.start_urls = self.login_url
        # print(type(self.login_url))
        inputs = response.xpath('//form//input').extract()
        soup_dict = {}
        for key, i in enumerate(inputs):
            soup = BeautifulSoup(i, 'html.parser')
            inp_type = soup.input['type'] if soup.input.has_attr('type') else None
            inp_value = soup.input['value'] if soup.input.has_attr('value') else None
            inp_name = soup.input['name'] if soup.input.has_attr('name') else None
            soup_dict[key] = {'name': inp_name, 'value': inp_value, 'type': inp_type}
        token = response.xpath('//*[@name="csrf_token"]/@value').extract_first()
        return FormRequest.from_response(response,
                                         formdata={'csrf_token': token,
                                                   'password': 'foobar',
                                                   'username': 'foobar'},
                                         callback=self.scrape_pages)

    def fetch_form_data(self, response):
        if all(field in response for field in self.fields_in_response):
            inputs = response.xpath('//form//input').extract()
            soup_dict = {}
            for key, i in enumerate(inputs):
                soup = BeautifulSoup(i, 'html.parser')
                inp_type = soup.input['type'] if soup.input.has_attr('type') else None
                inp_value = soup.input['value'] if soup.input.has_attr('value') else None
                inp_name = soup.input['name'] if soup.input.has_attr('name') else None
                soup_dict[key] = {'name': inp_name, 'value': inp_value, 'type': inp_type}

    def scrape_pages(self, response):
        open_in_browser(response)
        # Complete your code here to scrape the pages that you are redirected to after logging in
        # ....
        # ....
However, I'm not able to update the class variable start_urls with the variable passed from the ValidateURL class. I tried with __init__ in the QuotesSpider class, but that didn't work. Actually, start_urls is a class member of the base class (Spider). Could someone please help me understand how to update the class variable of the base class?
Could someone suggest what is missing?
You can pass parameters to the spider from the crawl command like this:
process.crawl(q.QuotesSpider, first='James', last='Bond')
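Keyword arguments passed to process.crawl() are handed to the spider, and the base Spider.__init__ stores them as instance attributes, so they shadow class-level defaults such as start_urls. A minimal sketch of how runScrap could forward the values from the data dict shown in the question (login_details is an illustrative extra keyword, not something the posted spider reads yet):
# crawl() kwargs become instance attributes on the spider, shadowing the
# class-level start_urls tuple defined on QuotesSpider.
process = CrawlerProcess()
process.crawl(q.QuotesSpider,
              start_urls=data['url'],
              login_details=data['login_details'])
process.start()
Inside parse() the credentials are then available as self.login_details['name'] and self.login_details['pwd'] instead of being hard-coded in the FormRequest.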
After adding the cb_kwargs keyword argument, the script stopped outputting any scraped data; it only printed the normal spider debug output. I have completely no idea why the hell it does that,
it looks like the whole parseMain is just sitting there doing nothing.
Here is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider

class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        websites = response.css('div.root')
        for websitep in websites:
            websiteurl = websitep.css('div.rp-l0pkv6 a::attr(href)').get()
            href = websitep.css('li.rp-np9kb1 a::attr(href)').get()
            url = response.urljoin(href)
            yield Request(url, cb_kwargs={'websiteurl': websiteurl}, callback=self.parseMain)

    def parseMain(self, response, websiteurl):
        # def parse(self, response):
        for quote in response.css('.rp-y89gny.eboilu01 ul li'):
            address = quote.css('address.rp-o9b83y::text').get(),
            name = quote.css('h2.rp-69f2r4::text').get(),
            href = quote.css('li.rp-np9kb1 a::attr(href)').get(),
            PAGETEST = response.css('a.rp-mmikj9::attr(href)').get()
            yield {
                'address': address,
                'name': name,
                'href': href,
                'PAGETEST': PAGETEST,
                'websiteurl': websiteurl
            }
        next_page = response.css('a.rp-mmikj9::attr(href)').get()
        if next_page is not None:
            next_page_link = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_link, callback=self.parse)

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()
Thanks for help in advance.
EDIT: Oh shoot, I forgot to say what my code is supposed to do.
Basically, parse is getting the website url from inside the sub-pages, like "https://rynekpierwotny.pl/deweloperzy/dom-development-sa-955/",
while parseMain is getting all the data (like address, name) from the main page "https://rynekpierwotny.pl/deweloperzy/?page=1".
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider

class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        for quote in response.css('.rp-y89gny.eboilu01 ul li'):
            yield {
                'address': quote.css('address.rp-o9b83y::text').get(),
                'name': quote.css('h2.rp-69f2r4::text').get(),
                'href': quote.css('li.rp-np9kb1 a::attr(href)').get(),
                'PAGETEST': response.css('a.rp-mmikj9::attr(href)').get()
            }
        next_page = response.css('a.rp-mmikj9::attr(href)').get()
        if next_page is not None:
            next_page_link = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_link, callback=self.parse)

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()
This worked
Edit:
I made some further adjustments based on your notes about what you want the program to do. It should work the way you expect now.
Try this instead:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider

class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        websites = response.css('div#root')[0]
        PAGETEST = response.xpath('//a[contains(@class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
        for website in websites.css('li.rp-np9kb1'):
            page = website.css('a::attr(href)').get()
            address = website.css('address.rp-o9b83y::text').get()
            name = website.css('h2.rp-69f2r4::text').get()
            params = {
                'address': address,
                'name': name,
                'href': page,
            }
            url = response.urljoin(page)
            yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
        yield Request(url=response.urljoin(PAGETEST), callback=self.parse)

    def parseMain(self, response, params=None):
        # print(response.url)
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        yield params

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()
After trying to add a third page to this shenanigans, I got an error: "You can't mix str and non-str arguments". My goal is to use the url from 'website' and scrape data from it. How do I do that?
Here is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider

class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        websites = response.css('div#root')[0]
        PAGETEST = response.xpath('//a[contains(@class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
        for website in websites.css('li.rp-np9kb1'):
            page = website.css('a::attr(href)').get()
            address = website.css('address.rp-o9b83y::text').get()
            name = website.css('h2.rp-69f2r4::text').get()
            params = {
                'address': address,
                'name': name,
                'href': page,
            }
            url = response.urljoin(page)
            urlem = response.urljoin(website)
            yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
            yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)
        yield Request(url=response.urljoin(PAGETEST), callback=self.parse)

    def parseMain(self, response, params=None):
        # print(response.url)
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        yield params

    def parseEmail(self, response, params=None):
        hps = HtmlXPathSelector(response)
        email = hxs.xpath('//body').re('([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()
Thanks for help in advance.
Simple debugging pointed me to the error line:
urlem = response.urljoin(website)  # You can't mix str and non-str arguments
website is a Selector, and urljoin needs a string.
Perhaps what you are looking for is this:
urlem = response.urljoin(website.xpath('.//a/@href').get())
Ok, I solved it.
I just moved the yield a bit.
Yield can't take strings that don't exist yet; the string needs to be created first,
and that's why I had problems before.
The website url was scraped in parseMain, not in parse.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider

class RynekMainSpider(scrapy.Spider):
    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        websites = response.css('div#root')[0]
        PAGETEST = response.xpath('//a[contains(@class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
        for website in websites.css('li.rp-np9kb1'):
            page = website.css('a::attr(href)').get()
            address = website.css('address.rp-o9b83y::text').get()
            name = website.css('h2.rp-69f2r4::text').get()
            params = {
                'address': address,
                'name': name,
                'href': page,
            }
            url = response.urljoin(page)
            yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
        yield Request(url=response.urljoin(PAGETEST), callback=self.parse)

    def parseMain(self, response, params=None):
        # print(response.url)
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        urlem = response.urljoin(website)
        yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)

    def parseEmail(self, response, params=None):
        email = response.css('div.m-Footer__company a::attr(href)').get()
        params['email'] = email
        yield params

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RynekMainSpider)
    process.start()
I believe that I have coded my xpaths incorrectly, as I only get a single result for each url, whereas there are in total 25 job posts for each url (not counting those on the next page). How can I correct my xpaths to get all the results?
Here's my scraper:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict

class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())

class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            loader.add_xpath('title', '//article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//article[@id]/div//div/p/a//text()')
            yield loader.load_item()
There was a slight mistake in the requests, which I have since corrected, for those of you who checked the question within the first 15 minutes of it being posted.
The problem was in the container's xpath. You only select the container, without the items in it, so you loop only once, over the container itself, and not over the actual items you want to scrape.
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict

class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())

class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            loader.add_xpath('title', '//article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//article[@id]/div//div/p/a//text()')
            yield loader.load_item()
I created a scrapy project to scrape a few pieces of information off this classifieds website, but the data I was getting needed to be formatted. After doing some research I figured out how to implement an ItemLoader, but now it does not write any scraped data to the csv file.
Here's my spider.py:
import scrapy
from ..items import TestItem
from scrapy.loader import ItemLoader

class TestSpiderSpider(scrapy.Spider):
    name = 'test'
    page_number = 2
    start_urls = ['https://jamaicaclassifiedonline.com/auto/cars/']

    def parse(self, response):
        for car in response.css('.col.l3.s12.m6'):
            items = TestItem()
            product_title = car.css('.jco-card-title::text').extract()
            product_imagelink = car.css('.card-image img::attr(data-src)').getall()
            urls = car.css('.card-image a::attr(href)').getall()
            for url in urls:
                url = response.urljoin(url)
                yield scrapy.Request(url=url, callback=self.parse_details)
            if product_title and product_imagelink:
                items['urls'] = urls

    def parse_details(self, response):
        l = ItemLoader(item=TestItem(), selector=response)
        l.add_css('product_title', '#title::text')
        yield l.load_item()
        pass
Here's my items.py:
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags

class TestItem(scrapy.Item):
    product_title = scrapy.Field(input_processors=MapCompose(remove_tags), output_processor=TakeFirst())
    pass
Here's my settings.py:
BOT_NAME = 'test'
SPIDER_MODULES = ['test.spiders']
NEWSPIDER_MODULE = 'test.spiders'
ROBOTSTXT_OBEY = True
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
}
Here's my pipelines.py:
class TestPipeline:
    def process_item(self, item, spider):
        return item
You don't need pipelines enabled to use ItemLoader; try without them.
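As a quick check (a minimal sketch, assuming the project layout from the question; cars.csv is only an illustrative output name), leave ITEM_PIPELINES unset in settings.py:
# settings.py: no ITEM_PIPELINES entry is required for ItemLoader to work
# ITEM_PIPELINES = {'test.pipelines.TestPipeline': 300}
and let the built-in feed exporter write whatever load_item() yields:
scrapy crawl test -o cars.csv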
Hey, so I have some experience scraping html but never json, and I need to scrape the following web page using scrapy: http://www.starcitygames.com/buylist/search?search-type=category&id=5061. I found a tutorial online that uses scrapy along with jmespath to scrape json data from the web. I got the tutorial to work, but I am trying to alter it to work with my website with no luck. No errors, but it does not return any data. Any help would be greatly appreciated!
items.py
import scrapy

class NameItem(scrapy.Item):
    """User item definition for jsonplaceholder /LoginSpider endpoint."""
    name = scrapy.Field()
    condition = scrapy.Field()
    price = scrapy.Field()
    rarity = scrapy.Field()
LoginSpider.py
import scrapy
import json
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import NameItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, SelectJmes

class UserSpider(scrapy.Spider):
    """Spider to scrape `http://www.starcitygames.com/buylist/search?search-type=category&id=5061`."""
    name = 'LoginSpider'
    allowed_domains = ['http://www.starcitygames.com/buylist/search?search-type=category&id=5061']
    start_urls = ['http://www.starcitygames.com/buylist/search?search-type=category&id=5061']

    # dictionary to map UserItem fields to Jmes query paths
    jmes_paths = {
        'name': 'name',
        'condition': 'condition',
        'price': 'price',
        'rarity': 'rarity',
    }

    def parse(self, response):
        jsonresponse = json.loads(response.body_as_unicode())
        for user in jsonresponse:
            loader = ItemLoader(item=NameItem())  # create an ItemLoader to populate a NameItem
            loader.default_input_processor = MapCompose(str)  # apply str conversion on each value
            loader.default_output_processor = Join(' ')
            for (field, path) in self.jmes_paths.items():
                loader.add_value(field, SelectJmes(path)(user))
            yield loader.load_item()
The response of this url http://www.starcitygames.com/buylist/search?search-type=category&id=5061 has 3 levels:
'Ok'
'search'
'results'  ## this contains the data
The results key has multiple values that you should iterate over; the data is inside those values.
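Roughly, the nesting looks like this (a hypothetical sketch inferred from the description above and from the loop in the spider below; the values are placeholders, not real data):
# Hypothetical shape of the JSON payload (placeholders only):
# {
#     "Ok": ...,
#     "search": ...,
#     "results": [                  # iterate this level ...
#         [                         # ... each entry is itself a list ...
#             {"name": ..., "condition": ..., "price": ..., "rarity": ...},
#             ...
#         ],
#         ...
#     ]
# }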
Try this code; I hope it helps.
This is the items.py module:
import scrapy

class SoResponseItem(scrapy.Item):
    name = scrapy.Field()
    condition = scrapy.Field()
    price = scrapy.Field()
    rarity = scrapy.Field()
This is the spider
import scrapy
import json
from SO_response.items import SoResponseItem

class LoginspiderSpider(scrapy.Spider):
    name = 'LoginSpider'
    allowed_domains = ['www.starcitygames.com']
    url = 'http://www.starcitygames.com/'

    def start_requests(self):
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        url = response.urljoin('buylist/search?search-type=category&id=5061')
        yield scrapy.Request(url=url, callback=self.parse_data)

    def parse_data(self, response):
        jsonreponse = json.loads(response.body)
        for result in jsonreponse['results']:
            for index in range(len(result)):
                items = SoResponseItem()
                items['name'] = result[index]['name']
                items['condition'] = result[index]['condition']
                items['price'] = result[index]['price']
                items['rarity'] = result[index]['rarity']
                yield items
Try this in your shell:
scrapy crawl LoginSpider -o jmes.json