I am working Google search crawling using scrapy. This is the code and it works well to get search results.
GoogleBot.py:
class GoogleBotsSpider(scrapy.Spider):
name = 'GoogleScrapyBot'
allowed_domains = ['google.com']
start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']
def parse(self, response):
item = {}
all_page = response.xpath('//*[#id="main"]')
for page in all_page:
title = page.xpath('//*[#id="main"]/div/div/div/a/h3/div/text()').extract()
link = page.xpath('//*[#id="main"]/div/div/div/a/#href').extract()
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
My next step is use "pipeline" on Scrapy to save a csv file for results.
Here is the code that I have written so far.
setting.py:
ITEM_PIPELINES = {'GoogleScrapy.pipelines.GooglePipeline': 300,}
pipelines.py:
from scrapy.exporters import CsvItemExporter
class GooglePipeline(object):
def __init__(self):
self.file = open("GoogleSearchResult.csv", 'wb')
self.exporter = CsvItemExporter(self.file, encoding='utf-8')
self.exporter.start_exporting()
def close_spider(self, spider):
self.exporter.finish_exporting()
self.file.close()
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
This is modified my spider code.
GoogleBot.py:
def parse(self, response):
item = {}
all_page = response.xpath('//*[#id="main"]')
for page in all_page:
item['title'] = page.xpath('//*[#id="main"]/div/div/div/a/h3/div/text()').extract()
item['link'] = page.xpath('//*[#id="main"]/div/div/div/a/#href').extract()
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
yield item
It has error where in:
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
I get this error:
for title, link in zip(title, link):
UnboundLocalError: local variable 'title' referenced before assignment
Here is the working output according to your comment.
import scrapy
class GoogleBotsSpider(scrapy.Spider):
name = 'GoogleScrapyBot'
allowed_domains = ['google.com']
start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']
def parse(self, response):
all_page = response.xpath('//*[#id="main"]')
for page in all_page:
titles = page.xpath('//*[#id="main"]/div/div/div/a/h3/div//text()').extract()
for title in titles:
links = page.xpath('//*[#id="main"]/div/div/div/a/#href').extract()
for link in links:
item={
'Title': title,
'Link':link
}
yield item
Related
I'm using python 3.6 and scrapy 2.4.1, and I wrote a spider to scrape about 5 pages, then use xlsxwriter to save to excel, however this scarpy only get last page data, can't figure out why, here is my spider code
import scrapy
from scrapy.selector import Selector
from ebay.items import EbayItem
class EbaySpiderSpider(scrapy.Spider):
name = 'ebay_spider'
allowed_domains = ['www.ebay.com.au']
start_urls = ['https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1']
def parse(self, response):
item_price_extract = []
item_title = []
item_title_list = response.xpath('//h3[#class="lvtitle"]/a')
item_href = response.xpath('//h3[#class="lvtitle"]/a/#href').getall()
for title in item_title_list:
item_title_text = title.xpath('string(.)').get()
item_title.append(item_title_text)
item_price = response.xpath('//li[#class="lvprice prc"]//span[#class="bold"]')
for i in range(len(item_price)):
item_price_text = item_price[i].xpath('string(.)').get()
item_price_extract.append(item_price_text.strip())
item_info = EbayItem(title=item_title, price=item_price_extract, item_href=item_href)
yield item_info
next_url_href = response.xpath('//a[#class="gspr next"]/#href').get()
if not next_url_href:
return
else:
yield scrapy.Request(next_url_href, callback=self.parse)
and pipeline code
import xlsxwriter
class EbayPipeline:
def open_spider(self, spider):
pass
def process_item(self, item, spider):
col_num = 0
workbook = xlsxwriter.Workbook(r'C:\Users\Clevo\Desktop\store_spider.xlsx')
worksheet = workbook.add_worksheet()
item_source = dict(item)
# print(item_source)
for key, values in item_source.items():
worksheet.write(0, col_num, key)
worksheet.write_column(1, col_num, values)
col_num += 1
workbook.close()
return item
someone know the reason why? it seems everything is ok, but I can only get last page data
by the way, is there anyway to transfer data to another function? I want to scrapy page detail and transfer the data to process_item function and yield them together
Better scraped every pages first and get data on its product page.
class EbaySpiderSpider(scrapy.Spider):
name = "ebay_spider"
def start_requests(self):
base_url = 'https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs='
for i in range(1,6):
page = base_url + str(i)#i will be the page number and add to base_url
yield scrapy.Request(url=page , callback=self.parse)
# scraped all product links first and yield to parse_contents
def parse(self, response):
links = response.xpath('//h3[#class="lvtitle"]/a/#href').extract()
for link in links:
yield scrapy.Request(url=link, callback=self.parse_contents)
#scraped desired data on product page
def parse_contents(self, response):
product_url = response.url
title = response.xpath('//h1/text()').extract()[0]
price = response.xpath('//span[#itemprop="price"]/text()').extract()[0]
item = EbayItem()
item['product_title'] = title
item['product_price'] = price
yield item ### to items.py
items.py, make sure that the item keys are equal to scrapy.Field()
class EbayITem(scrapy.Item):
product_title = scrapy.Field()
product_price = scrapy.Field()
pipelines.py
import xlsxwriter
class EbayPipeline:
def process_item(self, item, spider):
title = item['product_title']
price = item['product_price']
#process your worksheet here
Working version of your code
import scrapy
from scrapy.selector import Selector
from ebay.items import EbayItem
class EbaySpiderSpider(scrapy.Spider):
name = 'ebay_spider'
allowed_domains = ['ebay.com.au']
start_urls = ['https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1']
def parse(self, response):
item_price_extract = []
item_title = []
item_title_list = response.xpath('//h3[#class="lvtitle"]/a')
item_href = response.xpath('//h3[#class="lvtitle"]/a/#href').getall()
for title in item_title_list:
item_title_text = title.xpath('string(.)').get()
item_title.append(item_title_text)
item_price = response.xpath('//li[#class="lvprice prc"]//span[#class="bold"]')
for i in range(len(item_price)):
item_price_text = item_price[i].xpath('string(.)').get()
item_price_extract.append(item_price_text.strip())
item_info = EbayItem(title=item_title, price=item_price_extract, item_href=item_href)
yield item_info
next_url_href = response.xpath('//a[#class="gspr next"]/#href').get()
if next_url_href is not None:
next_url_href = response.urljoin(next_url_href)
yield scrapy.Request(next_url_href, callback=self.parse)
You will have to set ROBOTSTXT_OBEY=False in settings.py (which is not a good practice) or else it your spider won't scrape data and will give message:
[scrapy.downloadermiddlewares.robotstxt] DEBUG: Forbidden by robots.txt: <GET https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1>
I want to follow the links that I've scraped to get more details.
For example, from here, which contains all the job titles.
I wish to go to one of the links, for example, here to extract the job descriptions.
Below is my working code for getting the Title Link and Date as well as getting them to insert into a CSV File.
class MySpider(BaseSpider):
name = "craigslist"
allowed_domains = ["singapore.craigslist.com.sg"]
start_urls = ["https://singapore.craigslist.com.sg/d/jobs/search/jjj"]
def parse(self, response):
item = SampleItem()
item["title"] = response.xpath('//*[#class="result-info"]/a/text()').extract()
item["link"] = response.xpath('//*[#class="result-info"]/a/#href').extract()
item["date"] = response.xpath('//*[#class="result-info"]/time[#class="result-date"]/text()').extract()
for i in range(len(item["title"])):
yield {"Title": item['title'][i], "Link": item['link'][i], "Date": item['date'][i]}
This is my attempt to go to the link but it hasn't been successful.
class MySpider(BaseSpider):
name = "craigslist"
allowed_domains = ["singapore.craigslist.com.sg"]
start_urls = ["https://singapore.craigslist.com.sg/d/jobs/search/jjj"]
BASE_URL = 'https://singapore.craigslist.com.sg'
def parse(self, response):
links = response.xpath('//*[#class="result-info"]/a/#href').extract()
item = SampleItem()
item["title"] = response.xpath('//*[#class="result-info"]/a/text()').extract()
item["date"] = response.xpath('//*[#class="result-info"]/time[#class="result-date"]/text()').extract()
for i in range(len(item["title"])):
yield {"Title": item['title'][i], "Date": item['date'][i]}
for link in links:
absolute_url = self.BASE_URL + link
yield BaseSpider.Request(absolute_url, callback=self.parse_attr)
def parse_attr(self, response):
item = SampleItem()
item["description"] = response.xpath('//*[#id="postingbody"]/text()').extract()
for i in range(len(item["description"])):
yield {"Description" : item["description"]}
Any idea how to do this?
Log of scraper
I'm trying to pass a value from a function.
i looked up the docs and just didn't understand it.
ref:
def parse_page1(self, response):
item = MyItem()
item['main_url'] = response.url
request = scrapy.Request("http://www.example.com/some_page.html",
callback=self.parse_page2)
request.meta['item'] = item
yield request
def parse_page2(self, response):
item = response.meta['item']
item['other_url'] = response.url
yield item
here is a psudo code of what i want to achive:
import scrapy
class GotoSpider(scrapy.Spider):
name = 'goto'
allowed_domains = ['first.com', 'second.com]
start_urls = ['http://first.com/']
def parse(self, response):
name = response.xpath(...)
price = scrapy.Request(second.com, callback = self.parse_check)
yield(name, price)
def parse_check(self, response):
price = response.xpath(...)
return price
This is how you can pass any value, link etc to other methods:
import scrapy
class GotoSpider(scrapy.Spider):
name = 'goto'
allowed_domains = ['first.com', 'second.com']
start_urls = ['http://first.com/']
def parse(self, response):
name = response.xpath(...)
link = response.xpath(...) # link for second.com where you may find the price
request = scrapy.Request(url=link, callback = self.parse_check)
request.meta['name'] = name
yield request
def parse_check(self, response):
name = response.meta['name']
price = response.xpath(...)
yield {"name":name,"price":price} #Assuming that in your "items.py" the fields are declared as name, price
So let's say I want to write a spider that using the Facebook API to calculate the likes on every page of a website. If I import the requests library, I'm able to call the Facebook graph API as follows.
import scrapy
import json
import requests
API_KEY="KEY_GOES_HERE"
class WebSite(scrapy.Spider):
name = "website_page"
allowed_domains = ["website.com"]
start_urls = ['https://website.com/']
def get_likes(self,url):
base='https://graph.facebook.com/{}?access_token={}'.format(url,API_KEY)
data=requests.get(base)
return self.parse_likes(data)
def parse_likes(self, data):
data = json.loads(data.text)
return data['id'],data['share']['comment_count'],data['share']['share_count']
def parse(self, response):
item= {}
item['url'] = response.url
links = response.css('a::attr(href)').extract()
item['fb_url'],item['shares'],item['comments'] = self.get_likes(response.url)
for link in links:
link = response.urljoin(link)
item['link'] = link
yield scrapy.Request(link, callback=self.parse)
yield item
However, I can't seem to get this code to work if, rather than using the requests, I use the scrapy.Request call. Something like this.
import scrapy
import json
import requests
API_KEY="KEY_GOES_HERE"
class WebSite(scrapy.Spider):
name = "website_page"
allowed_domains = ["website.com"]
start_urls = ['https://website.com/']
def get_likes(self,url):
base='https://graph.facebook.com/{}?access_token={}'.format(url,API_KEY)
return scrapy.Request(base,callback=self.parse_likes)
def parse_likes(self, data):
data = json.loads(data.text)
return data['id'],data['share']['comment_count'],data['share']['share_count']
def parse(self, response):
item= {}
links = response.css('a::attr(href)').extract()
item['url'] = response.url
item['fb_data']=self.get_likes(response.url).body
for link in links:
link = response.urljoin(link)
item['link'] = link
yield scrapy.Request(link, callback=self.parse)
yield item
In this case, I just get a blank response for the Facebook data. I think i'm missing some understanding about how the scrapy.Request method works relative to the standard requests library. Any ideas?
This is a very common case: How to yield from item from multiple urls?
And the most common solution is to chain requests by carrying your item in request.meta paramater.
For your example implementation with this logic could look like:
class WebSite(scrapy.Spider):
base='https://graph.facebook.com/{}?access_token={}'.format
api_key = '1234'
def parse(self, response):
links = response.css('a::attr(href)').extract()
for link in links:
item= {}
item['url'] = response.url
item['fb_data']=self.get_likes(response.url).body
item['link'] = response.urljoin(link)
api_url = self.base(self.api_key, link)
yield scrapy.Request(api_url,
callback=self.parse_likes,
meta={'item': item})
def parse_likes(self, response):
item = response.meta['item']
data = json.loads(data.text)
share_count = data['id'],data['share']['comment_count'],data['share']['share_count']
item['share_count'] = share_count
yield item
I have written a scrapy crawler but I need to add the ability to read some arguments from the command line and then populates some static fields in my spider class. I also need to override the initialiser so it populates some of the spider fields.
import scrapy
from scrapy.spiders import Spider
from scrapy.http import Request
import re
class TutsplusItem(scrapy.Item):
title = scrapy.Field()
class MySpider(Spider):
name = "tutsplus"
allowed_domains = ["bbc.com"]
start_urls = ["http://www.bbc.com/"]
def parse(self, response):
links = response.xpath('//a/#href').extract()
# We stored already crawled links in this list
crawledLinks = []
for link in links:
# If it is a proper link and is not checked yet, yield it to the Spider
# if linkPattern.match(link) and not link in crawledLinks:
if not link in crawledLinks:
link = "http://www.bbc.com" + link
crawledLinks.append(link)
yield Request(link, self.parse)
titles = response.xpath('//a[contains(#class, "media__link")]/text()').extract()
for title in titles:
item = TutsplusItem()
item["title"] = title
print("Title is : %s" % title)
yield item
Then it should be run as:
scrapy runspider crawler.py arg1 arg2
How do I achieve this?
You can do that by overriding the init method of your spider like this.
class MySpider(Spider):
name = "tutsplus"
allowed_domains = ["bbc.com"]
start_urls = ["http://www.bbc.com/"]
arg1 = None
arg2 = None
def __init__(self, arg1, arg2, *args, **kwargs):
self.arg1 = arg1
self.arg2 = arg2
super(MySpider, self).__init__(*args, **kwargs)
def parse(self, response):
links = response.xpath('//a/#href').extract()
# We stored already crawled links in this list
crawledLinks = []
for link in links:
# If it is a proper link and is not checked yet, yield it to the Spider
# if linkPattern.match(link) and not link in crawledLinks:
if not link in crawledLinks:
link = "http://www.bbc.com" + link
crawledLinks.append(link)
yield Request(link, self.parse)
titles = response.xpath('//a[contains(#class, "media__link")]/text()').extract()
for title in titles:
item = TutsplusItem()
item["title"] = title
print("Title is : %s" % title)
yield item
Then run your spider like
scrapy crawl tutsplus -a arg1=arg1 -a arg2=arg2