I have written some Python code with Scrapy to extract some addresses from a website.
The first part of the code puts together the start_urls by reading latitude and longitude coordinates from a separate file, googlecoords.txt, which then form part of the start_urls. (I prepared the googlecoords.txt file previously by converting UK postcodes into Google coordinates for Google Maps.)
So, for example, the first item in the start_urls list is "https://www.howdens.com/process/searchLocationsNear.php?lat=53.674434&lon=-1.4908923&distance=1000&units=MILES", where "lat=53.674434&lon=-1.4908923" comes from the googlecoords.txt file.
However, when I run the code it works perfectly except that it prints out the googlecoords.txt file first - which I don't need.
How do I stop this print happening? (Though I can live with it.)
import scrapy
import sys
from scrapy.http import FormRequest, Request
from Howdens.items import HowdensItem

class howdensSpider(scrapy.Spider):
    name = "howdens"
    allowed_domains = ["www.howdens.com"]

    # read the file that has a list of google coordinates that are converted from postcodes
    with open("googlecoords.txt") as f:
        googlecoords = [x.strip('\n') for x in f.readlines()]

    # from the google coordinates build the start URLs
    start_urls = []
    for a in range(len(googlecoords)):
        start_urls.append("https://www.howdens.com/process/searchLocationsNear.php?{}&distance=1000&units=MILES".format(googlecoords[a]))

    # cycle through 6 of the first relevant items returned in the text
    def parse(self, response):
        for sel in response.xpath('/html/body'):
            for i in range(0, 6):
                try:
                    item = HowdensItem()
                    item['name'] = sel.xpath('.//text()').re(r'(?<="name":")(.*?)(?=","street")')[i]
                    item['street'] = sel.xpath('.//text()').re(r'(?<="street":")(.*?)(?=","town")')[i]
                    item['town'] = sel.xpath('.//text()').re(r'(?<="town":")(.*?)(?=","pc")')[i]
                    item['pc'] = sel.xpath('.//text()').re(r'(?<="pc":")(.*?)(?=","state")')[i]
                    yield item
                except IndexError:
                    pass
As someone in the comments pointed out, you should load it with the json module in the start_requests() method:
import scrapy
import json

class MySpider(scrapy.Spider):
    start_urls = ['https://www.howdens.com/process/searchLocationsNear.php?lat=53.674434&lon=-1.4908923&distance=1000&units=MILES']

    def parse(self, response):
        data = json.loads(response.text)  # response.text replaces the deprecated body_as_unicode()
        items = data['response']['depots']
        for item in items:
            url_template = "https://www.howdens.com/process/searchLocationsNear.php?{}&distance=1000&units=MILES"
            url = url_template.format(item['lat'])  # format in your location parameters here
            yield scrapy.Request(url, self.parse_item)

    def parse_item(self, response):
        print(response.url)
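Applied back to the original spider, the same idea of moving the file handling into start_requests() keeps the class body free of side effects, so nothing from googlecoords.txt is read or echoed at import time. A minimal sketch, assuming each line of the file already has the form lat=...&lon=... as in the question:

import scrapy

class HowdensSpider(scrapy.Spider):
    name = "howdens"
    allowed_domains = ["www.howdens.com"]

    def start_requests(self):
        # Build the requests lazily when the crawl starts instead of in the class body.
        url_template = ("https://www.howdens.com/process/searchLocationsNear.php?"
                        "{}&distance=1000&units=MILES")
        with open("googlecoords.txt") as f:
            for line in f:
                coords = line.strip()  # e.g. "lat=53.674434&lon=-1.4908923"
                if coords:
                    yield scrapy.Request(url_template.format(coords), callback=self.parse)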
I output the URL of the first order-results page for each seller, extracted from a specific EC site, to a CSV file; I read that file in start_requests and loop through it with a for statement.
Each order result page contains information on 30 products.
https://www.buyma.com/buyer/2597809/sales_1.html
The links for the 30 items on each order-results page come back as a list, and I tried to retrieve them one by one and store each in an item as shown in the code below, but it does not work as intended.
import csv

import scrapy
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider

from ..items import ResearchtoolItem


class AllSaledataSpider(CrawlSpider):
    name = 'all_salesdata_copy2'
    allowed_domains = ['www.buyma.com']

    def start_requests(self):
        with open('/Users/morni/researchtool/AllshoppersURL.csv', 'r', encoding='utf-8-sig') as f:
            reader = csv.reader(f)
            for row in reader:
                for n in range(1, 300):
                    url = str((row[2])[:-5] + '/sales_' + str(n) + '.html')
                    yield scrapy.Request(
                        url=url,
                        callback=self.parse_firstpage_item,
                        dont_filter=True
                    )

    def parse_firstpage_item(self, response):
        loader = ItemLoader(item=ResearchtoolItem(), response=response)
        Conversion_date = response.xpath('//*[@id="buyeritemtable"]/div/ul/li[2]/p[3]/text()').getall()
        product_name = response.xpath('//*[@id="buyeritemtable"]/div/ul/li[2]/p[1]/a/text()').getall()
        product_URL = response.xpath('//*[@id="buyeritemtable"]/div/ul/li[2]/p[1]/a/@href').getall()
        for i in range(30):
            loader.add_value("Conversion_date", Conversion_date[i])
            loader.add_value("product_name", product_name[i])
            loader.add_value("product_URL", product_URL[i])
            yield loader.load_item()
The output is as follows: each item contains the information for multiple products at once.
Current status:
{"product_name": ["product1", "product2"]), "Conversion_date":["Conversion_date1", "Conversion_date2" ], "product_URL":["product_URL1", "product_URL2"]},
Ideal:
[{"product_name": "product1", "Conversion_date": Conversion_date1", "product_URL": "product_URL1"},{"product_name": "product2", "Conversion_date": Conversion_date2", "product_URL": "product_URL2"}]
This may be due to my lack of understanding of basic for statements and yield.
You need to create a new loader on each iteration:
for i in range(30):
    loader = ItemLoader(item=ResearchtoolItem(), response=response)
    loader.add_value("Conversion_date", Conversion_date[i])
    loader.add_value("product_name", product_name[i])
    loader.add_value("product_URL", product_URL[i])
    yield loader.load_item()
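If the three lists are guaranteed to be parallel, a variant of the same loop using zip() avoids the hard-coded range(30) and the risk of an IndexError on pages with fewer than 30 entries; this is only a sketch under that assumption:

# Sketch: assumes Conversion_date, product_name and product_URL are the parallel
# lists extracted in parse_firstpage_item above.
for date, name, url in zip(Conversion_date, product_name, product_URL):
    loader = ItemLoader(item=ResearchtoolItem(), response=response)
    loader.add_value("Conversion_date", date)
    loader.add_value("product_name", name)
    loader.add_value("product_URL", url)
    yield loader.load_item()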
EDIT:
add_value appends a value to a list. Since the field starts with zero elements, after you append you'll have a list with one element.
In order to get the values as strings you can use a processor. Example:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class ProductItem(scrapy.Item):
    name = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(output_processor=TakeFirst())


class ExampleSpider(scrapy.Spider):
    name = 'exampleSpider'
    start_urls = ['https://scrapingclub.com/exercise/list_infinite_scroll/']

    def parse(self, response, **kwargs):
        names = response.xpath('//div[@class="card-body"]//h4/a/text()').getall()
        prices = response.xpath('//div[@class="card-body"]//h5//text()').getall()

        length = len(names)
        for i in range(length):
            loader = ItemLoader(item=ProductItem(), response=response)
            loader.add_value('name', names[i])
            loader.add_value('price', prices[i])
            yield loader.load_item()
Here's the code I'll be working with (I'm using Scrapy):
def start_requests(self):
    start_urls = ['https://www.lowes.com/search?searchTerm=8654RM-42']
This is where I'm storing all my URLs.
Here is how I'm trying to keep only the part after the '=':
productSKU = response.url.split("=")[-1]
item["productSKU"] = productSKU
Here is the output:
{'productPrice': '1,449.95',
'productSKU': 'https://www.lowes.com/pd/ZLINE-KITCHEN-BATH-Ducted-Red-Matte-Wall-Mounted-Range-Hood-Common-42-Inch-Actual-42-in/1001440644'}
So now here's the problem:
The URLs I'm inputting will eventually be populated with https://www.lowes.com/search?searchTerm={something}, and that's why I would like to capture {something}: it ensures every item I attempted to scrape ends up in the CSV (for sorting and matching purposes).
The URL I'm using redirects me to this URL:
(Input)https://www.lowes.com/search?searchTerm=8654RM-42
->
(Redirect) https://www.lowes.com/pd/ZLINE-KITCHEN-BATH-Ducted-Red-Matte-Wall-Mounted-Range-Hood-Common-42-Inch-Actual-42-in/1001440644
And so, my output for productSKU is the entire redirect URL instead of just whatever comes after the '=' sign. The output I would like is 8654RM-42.
And here is my whole program
# -*- coding: utf-8 -*-
import scrapy
from ..items import LowesspiderItem
from scrapy.http import Request


class LowesSpider(scrapy.Spider):
    name = 'lowes'

    def start_requests(self):
        start_urls = ['https://www.lowes.com/search?searchTerm=8654RM-42']
        for url in start_urls:
            yield Request(url, cookies={'sn': '2333'})  # added cookie to bypass location requirement

    def parse(self, response):
        items = response.css('.grid-container')
        for product in items:
            item = LowesspiderItem()

            # get product price
            productPrice = product.css('.art-pd-price::text').get()

            productSKU = response.url.split("=")[-1]

            item["productSKU"] = productSKU
            item["productPrice"] = productPrice
            yield item
You need to use meta to pass in the input URL, like this:
def start_requests(self):
    start_urls = ['https://www.lowes.com/search?searchTerm=8654RM-42']
    for url in start_urls:
        yield Request(url, cookies={'sn': '2333'}, meta={'url': url})

def parse(self, response):
    url = response.meta['url']  # your input url
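To complete the idea, here is a minimal sketch of parse() that derives the SKU from that stored input URL; the urllib.parse calls are my suggestion rather than part of the original answer, and the item fields are the ones from the question:

from urllib.parse import urlparse, parse_qs  # put this import at the top of the spider module

def parse(self, response):
    original_url = response.meta['url']             # the pre-redirect search URL
    query = parse_qs(urlparse(original_url).query)  # e.g. {'searchTerm': ['8654RM-42']}
    product_sku = query.get('searchTerm', [''])[0]

    for product in response.css('.grid-container'):
        item = LowesspiderItem()
        item['productSKU'] = product_sku            # '8654RM-42' instead of the redirect URL
        item['productPrice'] = product.css('.art-pd-price::text').get()
        yield item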
Previously my Scrapy spider used to do the following:
it would open each link from the IMDb top 250 and get the info that I needed.
Right now I have a CSV file with 500 links, and I need it to open them one by one and get the information I seek.
However, I'm a bit lost and have no idea how to do it.
I was thinking of changing def parse(self, response), but I'm not sure how.
This is my previous code:
import scrapy
from imdb2.items import Imdb2Item


class ThirdSpider(scrapy.Spider):
    name = "imdbtestspider"
    allowed_domains = ["imdb.com"]
    start_urls = (
        'http://www.imdb.com/chart/top',
    )

    def parse(self, response):
        links = response.xpath('//tbody[@class="lister-list"]/tr/td[@class="titleColumn"]/a/@href').extract()
        i = 1
        for link in links:
            abs_url = response.urljoin(link)

            url_next = '//*[@id="main"]/div/span/div/div/div[2]/table/tbody/tr[' + str(i) + ']/td[3]/strong/text()'
            rating = response.xpath(url_next).extract()
            if (i <= len(links)):
                i = i + 1
            yield scrapy.Request(abs_url, callback=self.parse_indetail, meta={'rating': rating})

    def parse_indetail(self, response):
        item = Imdb2Item()

        item['title'] = response.xpath('//div[@class="title_wrapper"]/h1/text()').extract()[0][:-1]
        item['production'] = response.xpath('//h4[contains(text(), "Production Co")]/following-sibling::a/text()').extract()
        return item
And my code is like this right now:
import scrapy
from imdb2.items import Imdb2Item
import csv
import re
from scrapy.contrib.linkextractors import LinkExtractor


class ThirdSpider(scrapy.Spider):
    name = "imdbtestspider"
    allowed_domains = []

    with open('links.csv') as f:
        start_urls = [url.strip() for url in f.readlines()]

    def parse(self, response):
        # this should change, I guess?
        pass

    def parse_indetail(self, response):
        item = Imdb2Item()

        item['title'] = response.xpath('//div[@class="title_wrapper"]/h1/text()').extract()[0][:-1]
        item['production'] = response.xpath('//h4[contains(text(), "Production Co")]/following-sibling::a/text()').extract()
        return item
I added the part that gets my links from the CSV file, but I don't know what to change in def parse.
Thank you.
Do you have film links in your CSV file? In that case your code will look like this:
import scrapy
from imdb2.items import Imdb2Item
import csv


class ThirdSpider(scrapy.Spider):
    name = "imdbtestspider"

    def start_requests(self):
        with open('links.csv', 'r') as f:
            for url in f.readlines():
                yield scrapy.Request(url.strip())

    def parse(self, response):
        item = Imdb2Item()
        item['title'] = response.xpath('//div[@class="title_wrapper"]/h1/text()').extract()[0][:-1]
        item['production'] = response.xpath('//h4[contains(text(), "Production Co")]/following-sibling::a/text()').extract()
        yield item
I am trying to scrape TripAdvisor's reviews, but I cannot find the XPath to have it dynamically go through all the pages. I tried yield and callback, but the problem is that I cannot find the XPath for the link that goes to the next page. I am talking about this site.
Here is my code (UPDATED):
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem


class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"
    start_urls = [
        "http://www.tripadvisor.in/Hotel_Review-g297679-d300955-Reviews-Ooty_Fern_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]

    output_json_dict = {}

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        items = []
        i = 0
        for sites in sites:
            item = ScrapingTestingItem()
            #item['reviews'] = sel.xpath('//p[@class="partial_entry"]/text()').extract()
            item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
            item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
            item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
            items.append(item)
            i += 1

        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        if(sites and len(sites) > 0):
            yield Request(url="tripadvisor.in" + sites[i], callback=self.parse)
        else:
            yield items
If you want to select the URL behind Next, why don't you try something like this:
next_url = response.xpath('//a[contains(text(), "Next")]/@href').extract()
And then yield a Request with this URL? With this you always get the next page to scrape and do not need the line containing the numbers.
Recently I did something similar on TripAdvisor and this approach worked for me. If this doesn't work for you, update your question with the approach you are trying so we can see where it can be improved.
Update
And change your Request creation block to the following:
if(sites and len(sites) > 0):
    for site in sites:
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
Remove the else part and yield the items at the end of the method, once it has finished parsing everything on the page.
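A rough sketch of the revised parse() under those changes; the XPaths are the ones from the question, and using response.urljoin() for the relative "Next" href is my suggestion rather than part of the original answer:

def parse(self, response):
    sel = Selector(response)

    # Scrape the review fields from the current page (same XPaths as in the question).
    item = ScrapingTestingItem()
    item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
    item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
    item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
    yield item

    # Follow every "Next" link; response.urljoin() copes with relative hrefs.
    sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
    for site in sites:
        yield Request(response.urljoin(site), callback=self.parse)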
I think it can only work if you put the list of URLs you want to scrape in a .txt file.
class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"

    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()
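If you prefer not to open the file in the class body, the same list can be built inside start_requests(); this is only a sketch, and it assumes a parse() callback defined as in the question:

from scrapy.spiders import Spider
from scrapy.http import Request

class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]

    def start_requests(self):
        # Read urls.txt only when the crawl starts, and close the file automatically.
        with open("urls.txt") as f:
            for url in f:
                url = url.strip()
                if url:
                    yield Request(url, callback=self.parse)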
I have already created one spider that collects a list of company names with matching phone numbers. This is then saved to a CSV file.
I then want to scrape data from another site, using the phone numbers in the CSV file as POST data. I want it to loop through the same start URL, scraping the data that each phone number produces, until there are no more numbers left in the CSV file.
This is what I have got so far:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector
from scrapy import log
import sys
from scrapy.shell import inspect_response
from btw.items import BtwItem
import csv


class BtwSpider(BaseSpider):
    name = "btw"
    allowed_domains = ["siteToScrape.com"]
    start_urls = ["http://www.siteToScrape.com/broadband/broadband_checker"]

    def parse(self, response):
        phoneNumbers = ['01253873647', '01253776535', '01142726749']
        return [FormRequest.from_response(response,
                                          formdata={'broadband_checker[phone]': phoneNumbers[1]},
                                          callback=self.after_post)]

    def after_post(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@id="results"]')
        items = []
        for site in sites:
            item = BtwItem()

            fttcText = site.select("div[@class='content']/div[@id='btfttc']/ul/li/text()").extract()

            # Now we will change the text to be a boolean value
            if fttcText[0].count('not') > 0:
                fttcEnabled = 0
            else:
                fttcEnabled = 1

            item['fttcAvailable'] = fttcEnabled
            items.append(item)
        return items
At the minute I have just been trying to get this to loop through a list (phoneNumbers), but I have not even managed to get that to work so far. Once I know how to do that, I will be able to get it to pull the numbers from a CSV file by myself. In its current state it is just using the phone number at index 1 in the list.
Assuming you have a phones.csv file with phones in it:
01253873647
01253776535
01142726749
Here's your spider:
import csv

from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector


class BtwItem(Item):
    fttcAvailable = Field()
    phoneNumber = Field()


class BtwSpider(BaseSpider):
    name = "btw"
    allowed_domains = ["samknows.com"]

    def start_requests(self):
        yield Request("http://www.samknows.com/broadband/broadband_checker", self.parse_main_page)

    def parse_main_page(self, response):
        with open('phones.csv', 'r') as f:
            reader = csv.reader(f)
            for row in reader:
                phone_number = row[0]
                yield FormRequest.from_response(response,
                                                formdata={'broadband_checker[phone]': phone_number},
                                                callback=self.after_post,
                                                meta={'phone_number': phone_number})

    def after_post(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@id="results"]')
        phone_number = response.meta['phone_number']
        for site in sites:
            item = BtwItem()

            fttc = site.select("div[@class='content']/div[@id='btfttc']/ul/li/text()").extract()

            item['phoneNumber'] = phone_number
            item['fttcAvailable'] = 'not' in fttc[0]  # True when the word 'not' appears in the result text

            yield item
Here's what was scraped after running it:
{'fttcAvailable': False, 'phoneNumber': '01253873647'}
{'fttcAvailable': False, 'phoneNumber': '01253776535'}
{'fttcAvailable': True, 'phoneNumber': '01142726749'}
The idea is to fetch the main page via start_requests, then read the CSV file line by line in the callback and yield a new FormRequest for each phone number (CSV row). Additionally, pass phone_number to the callback through the meta dictionary in order to write it to the item field (I think you need this to distinguish items/results).
Hope that helps.