I have a simple spider that crawls local obituaries. The code works perfectly until I try to add two static columns. All I want to do is add the date I pulled the information (the pull item) and the state in which it was pulled (the state item). It's a self-loading page, so when I add the pull date I only get the first 10 results (the first page). If I add just the state, I only get two results. When I remove both, I get all 40+ results.
I have commented out (#) the lines that aren't working properly:
Item.py file:
import scrapy

class AlItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    obit = scrapy.Field()
    news = scrapy.Field()
    # pull = scrapy.Field()
    # state = scrapy.Field()
spider file:
import scrapy
import time
from al.items import AlItem

class AlabamaSpider(scrapy.Spider):
    name = 'alabama'
    allowed_domains = ['legacy.com']
    start_urls = ['http://www.legacy.com/obituaries/annistonstar/browse?type=paid&page=20']

    def parse(self, response):
        name = response.xpath('//a[@class="NonMobile"]/p[@class="obitName"]/text()').extract()
        link = response.xpath('//div[@class="RightColumn"]//a[@class="ObituaryButton"]/@href').extract()
        obit = response.xpath('//div[@class="NameAndLocation"]/p[@class="obitText"]/text()').extract()
        news = response.xpath('//div[@class="PublishedLine publishedLine"]/span/text()').extract()
        # pull = time.strftime("%m/%d/%Y")
        # state = "AL"

        for item in zip(name, link, obit, news):  # removed 'pull, state'
            new_item = AlItem()
            new_item['name'] = item[0]
            new_item['link'] = item[1]
            new_item['obit'] = item[2]
            new_item['news'] = item[3]
            # new_item['pull'] = pull
            # new_item["state"] = state
            yield new_item
Here is why: if you add pull and state to zip(name, link, obit, news), the loop runs only 2 times, because state = "AL" is a string. zip() iterates over a string character by character and stops at its shortest argument, so the two-character string caps every argument at 2 iterations. The same thing happens with the date: a string like 01/01/2001 has 10 characters, so you get 10 iterations, which is why you only see the first page of results.
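To see the truncation in isolation, here is a minimal sketch in plain Python with made-up sample data:

names = ['Smith', 'Jones', 'Brown', 'Davis']
links = ['/obit/1', '/obit/2', '/obit/3', '/obit/4']
state = "AL"   # a 2-character string is iterated character by character

print(list(zip(names, links, state)))
# [('Smith', '/obit/1', 'A'), ('Jones', '/obit/2', 'L')]  -- only two rows survive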
This will work:
class AlItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    obit = scrapy.Field()
    news = scrapy.Field()
    pull = scrapy.Field()
    state = scrapy.Field()
import scrapy
import time
from al.items import AlItem

class AlabamaSpider(scrapy.Spider):
    name = 'alabama'
    allowed_domains = ['legacy.com']
    start_urls = ['http://www.legacy.com/obituaries/annistonstar/browse?type=paid&page=20']

    def parse(self, response):
        name = response.xpath('//a[@class="NonMobile"]/p[@class="obitName"]/text()').extract()
        link = response.xpath('//div[@class="RightColumn"]//a[@class="ObituaryButton"]/@href').extract()
        obit = response.xpath('//div[@class="NameAndLocation"]/p[@class="obitText"]/text()').extract()
        news = response.xpath('//div[@class="PublishedLine publishedLine"]/span/text()').extract()
        pull = time.strftime("%m/%d/%Y")
        state = "AL"

        for item in zip(name, link, obit, news):  # pull and state stay out of the zip()
            new_item = AlItem()
            new_item['name'] = item[0]
            new_item['link'] = item[1]
            new_item['obit'] = item[2]
            new_item['news'] = item[3]
            new_item['pull'] = pull
            new_item["state"] = state
            yield new_item
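With pull and state assigned inside the loop but left out of zip(), every yielded item carries the two constant columns, and a normal feed export should now return all 40+ rows (the output filename below is just an example):

scrapy crawl alabama -o obituaries.csv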
Related
import scrapy
from ..items import AmazondawinItem

class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    pagenumber = 3
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        items = AmazondawinItem()
        name = response.css('.a-size-medium::text').extract()
        try:
            old_price = response.css('.a-spacing-top-micro .a-text-price span::text').extract()
        except:
            old_price = None
        price = response.css('.a-spacing-top-micro .a-price-whole::text').extract()
        try:
            review = response.css('.s-link-style .s-underline-text::text').extract()
        except:
            review = None
        imagelink = response.css('.s-image::attr(src)').extract()
        items['name'] = name
        items['old_price'] = old_price
        items['price'] = price
        items['review'] = review
        items['imagelink'] = imagelink
        # description =
        # ram =
        # brand =
        # cpu_model =
        yield items
When I export my scraped data to a CSV file (or any other format), all of the data ends up in a single row, no matter how many rows there should be. For example, where I expect 200 rows in one column, I instead get the data for all 200 rows crammed into one row.
It's because you're yielding all the items instead of yielding each item separately.
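In other words, parse() currently builds one item whose fields are whole lists, so the exporter writes each list into a single cell of one row. Conceptually, with made-up values in place of the real selectors:

# what the spider yields now: one item with list-valued fields -> one giant CSV row
yield {'name': ['laptop A', 'laptop B'], 'price': ['499', '899']}

# what the exporter needs: one item per product -> one CSV row per product
yield {'name': 'laptop A', 'price': '499'}
yield {'name': 'laptop B', 'price': '899'}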
A not so nice solution:
import scrapy
# from ..items import AmazondawinItem

class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    pagenumber = 3
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        # items = AmazondawinItem()
        name = response.css('.a-size-medium::text').extract()
        try:
            old_price = response.css('.a-spacing-top-micro .a-text-price span::text').extract()
        except:
            old_price = None
        price = response.css('.a-spacing-top-micro .a-price-whole::text').extract()
        try:
            review = response.css('.s-link-style .s-underline-text::text').extract()
        except:
            review = None
        imagelink = response.css('.s-image::attr(src)').extract()
        # items = dict()
        # items['name'] = name
        # items['old_price'] = old_price
        # items['price'] = price
        # items['review'] = review
        # items['imagelink'] = imagelink
        items = dict()
        for (items['name'], items['old_price'], items['price'], items['review'], items['imagelink']) in zip(name, old_price, price, review, imagelink):
            yield items
        # description =
        # ram =
        # brand =
        # cpu_model =
        # yield items
A better solution:
Remove the try/except blocks: the get() function returns None when no value is found, so there is nothing to catch, and bare except blocks are best avoided in spiders anyway.
Iterate over the result rows and get the fields one by one.
To switch to your item class, just replace the dict with AmazondawinItem(); make sure it is created inside the loop (see the sketch after the code below).
import scrapy
# from ..items import AmazondawinItem

class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    pagenumber = 3
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        for row in response.css('div.s-result-list div.s-result-item.s-asin'):
            # items = AmazondawinItem()
            items = dict()
            items['name'] = row.css('.a-size-medium::text').get()
            items['old_price'] = row.css('.a-spacing-top-micro .a-text-price span::text').get()
            items['price'] = row.css('.a-spacing-top-micro .a-price-whole::text').get()
            items['review'] = row.css('.s-link-style .s-underline-text::text').get()
            items['imagelink'] = row.css('.s-image::attr(src)').get()
            yield items
            # description =
            # ram =
            # brand =
            # cpu_model =
            # yield items
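If you would rather use the item class than a plain dict (assuming AmazondawinItem declares these same fields), the only change is inside the loop; a minimal sketch of the loop body:

from ..items import AmazondawinItem

# inside parse(), replacing the dict version above
for row in response.css('div.s-result-list div.s-result-item.s-asin'):
    item = AmazondawinItem()                 # fresh item for every result row
    item['name'] = row.css('.a-size-medium::text').get()
    item['price'] = row.css('.a-spacing-top-micro .a-price-whole::text').get()
    item['imagelink'] = row.css('.s-image::attr(src)').get()
    yield item                               # one item per row -> one CSV row each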
I am new to Scrapy and this is my first try at web scraping. The structure of the website I am trying to scrape is as follows:
level 0: main company URL ---> level 1: several associated company URLs ---> level 2: each associated company URL in level 1 links to many more URLs ---> ... up to level n
Right now I can scrape data up to level 1, but I want to do it recursively up to the n-th level, with a control like max_depth that limits how deep the spider goes.
I cannot figure out how to do it.
Here is the spider I have written so far:
import scrapy
from ..items import *

class NodeSpider(scrapy.Spider):
    name = 'nodes'
    start_urls = ['https://www.zaubacorp.com/companysearchresults/DOIT-']
    base_url = 'https://www.zaubacorp.com/'
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def parse(self, response):
        search_links = response.xpath('//table[@id="results"]/tr/td/a[contains(@href,"company/DOIT-URBAN")]/@href').getall()
        page_list = search_links[1:]
        #url = search_links.pop(0)
        check_list = []
        for url in search_links:
            print("func 1")
            yield response.follow(url=url, callback=self.parse_doit,
                                  meta={'page_list': page_list,
                                        'check_list': check_list})

    def parse_doit(self, response):
        print("func 2")
        check_list = response.meta['check_list']
        lnk = MainLink()
        lnk['name'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[1]/div[1]/table/tbody/tr[1]/td[2]/p//text()').get()
        lnk['url'] = response.url
        lnk['address'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[4]/text()').get()
        lnk['email'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[1]/text()').get()
        lnk['director1'] = response.xpath('//*[@id="package1"]/td[2]//p//text()').get()
        lnk['director2'] = response.xpath('//*[@id="package2"]/td[2]//p//text()').get()
        dir1_same_co_list = response.xpath('//*[@id="accordion1"]/table[1]//td//p/a/@href').getall()
        dir2_same_co_list = response.xpath('//*[@id="accordion2"]/table[1]//td//p/a/@href').getall()
        co_list = dir1_same_co_list + list(set(dir2_same_co_list) - set(dir1_same_co_list))
        dir_same_co_list = list(set(co_list) - set(check_list))
        check_list = check_list + list(set(dir_same_co_list) - set(check_list))
        page_list = response.meta['page_list']
        if dir1_same_co_list:
            next_url_dir = dir_same_co_list.pop(0)
            yield response.follow(url=next_url_dir, callback=self.parse_level_2,
                                  meta={'name': lnk,
                                        'url': lnk,
                                        'address': lnk,
                                        'email': lnk,
                                        'director1': lnk,
                                        'director2': lnk,
                                        'dir_same_co_list': dir_same_co_list,
                                        'page_list': page_list})

    def parse_level_2(self, response):
        print("func 3")
        lnk = response.meta['name']
        lnk = response.meta['url']
        lnk = response.meta['address']
        lnk = response.meta['email']
        lnk = response.meta['director1']
        lnk = response.meta['director2']
        page_list = response.meta['page_list']
        #next_page = response.meta['next_page']
        level_2 = SecondaryLink()
        try:
            lnk['Company_Details_W_Same_Directors']
        except:
            lnk['Company_Details_W_Same_Directors'] = []
        #for sub_link in dir1_same_co_list:
        level_2['Co_Name'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[1]/div[1]/table/tbody/tr[1]/td[2]/p//text()').get()
        level_2['Co_url'] = response.url
        level_2['Address'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[4]/text()').get()
        level_2['Email'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[1]/text()').get()
        level_2['First_Director'] = response.xpath('//*[@id="package1"]/td[2]//p//text()').get()
        level_2['Second_Director'] = response.xpath('//*[@id="package2"]/td[2]//p//text()').get()
        lnk['Company_Details_W_Same_Directors'].append(level_2)
        dir_same_co_list = response.meta['dir_same_co_list']
        print("===== start reading co list =====")
        if dir_same_co_list:
            next_url_dir = dir_same_co_list.pop(0)
            print("co list", len(dir_same_co_list))
            yield response.follow(url=next_url_dir, callback=self.parse_level_2,
                                  meta={'name': lnk,
                                        'url': lnk,
                                        'address': lnk,
                                        'email': lnk,
                                        'director1': lnk,
                                        'director2': lnk,
                                        'dir_same_co_list': dir_same_co_list,
                                        'page_list': page_list})
        else:
            if page_list:
                print("next page loop")
                next_page = page_list.pop(0)
                next_page_url = next_page
                yield response.follow(url=next_page_url, callback=self.parse_doit,
                                      meta={'name': lnk,
                                            'url': lnk,
                                            'address': lnk,
                                            'email': lnk,
                                            'director1': lnk,
                                            'director2': lnk,
                                            'next_page': next_page,
                                            'page_list': page_list})
            else:
                yield lnk
and the items.py is as follows:
class MainLink(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
    address = scrapy.Field()
    email = scrapy.Field()
    director1 = scrapy.Field()
    Company_Details_W_Same_Directors = scrapy.Field()
    director2 = scrapy.Field()
    pass

class SecondaryLink(scrapy.Item):
    Co_Name = scrapy.Field()
    Co_url = scrapy.Field()
    Address = scrapy.Field()
    Email = scrapy.Field()
    First_Director = scrapy.Field()
    Second_Director = scrapy.Field()
    pass
Help is much appreciated
You can make use of the DEPTH_LIMIT setting in Scrapy. Please see https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit
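As a rough sketch of how that could plug into your spider (the value 3 is only an example; pick whatever maximum depth you need), DEPTH_LIMIT can be set per spider through custom_settings, which your spider already uses for the dupefilter:

class NodeSpider(scrapy.Spider):
    name = 'nodes'
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
        'DEPTH_LIMIT': 3,  # stop scheduling requests more than 3 links deep from the start URLs
    }

Putting DEPTH_LIMIT = 3 in the project's settings.py has the same effect for every spider in the project.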
I want to crawl the township directory of China. The website is structured in 4 levels, which are province page, city page, county page, and township page. For example, on the province page, all the provinces are listed. If we click the link of one province, then it takes us to the city page and a list of the cities in that province is displayed.
I want each of my items to be a township, including town_name, town_id (gbcode), and the corresponding county_name, city_name, and prov_name, so the spider should collect information along the way as it goes deeper toward the township page. However, my current approach using a for loop does not seem to work. There is no problem with prov_name, but the city and county names are mostly incorrect: they are always the last city/county listed on their corresponding page. I think the problem is that the spider does not go deep enough, only issuing the parse_county request at the end of the loop, but changing the depth priority in the settings does not solve the problem.
---------- Sample Result --------
town_name, year, gbcode, city, province, county
建国门街道办事处,2016,110101008000,市辖区,北京市,延庆区
东直门街道办事处,2016,110101009000,市辖区,北京市,延庆区
和平里街道办事处,2016,110101010000,市辖区,北京市,延庆区
前门街道办事处,2016,110101011000,市辖区,北京市,延庆区
崇文门外街道办事处,2016,110101012000,市辖区,北京市,延庆区
import scrapy
import re
from scrapy.spiders import Spider
from admincode.items import AdmincodeItem

class StatsSpider(Spider):
    name = 'stats'
    allowed_domains = ['stats.gov.cn']
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{}/index.html'.format(year) for year in range(2009, 2010)]

    def parse(self, response):
        for item in self.parse_provincetr(response, response.selector.css(".provincetr")):
            yield item

    def get_text_href(self, td):
        if not td.xpath('a'):
            return td.xpath('text()').extract()[0], None
        else:
            return td.xpath('a/text()').extract()[0], td.xpath('a/@href').extract()[0]

    def parse_provincetr(self, response, trs):
        year_pattern = re.compile('(tjyqhdmhcxhfdm/)([0-9][0-9][0-9][0-9])')
        year = year_pattern.search(response.url).group(2)
        for td in trs.xpath('td'):
            scraped = {}
            scraped['year'] = year
            scraped['prov_name'], href = self.get_text_href(td)
            url = response.urljoin(href)
            yield scrapy.Request(url, callback=self.parse_citytr,
                                 meta={'scraped': scraped})

    def parse_2td(self, response, trs, var_name, nextparse):
        for tr in trs:
            scraped = response.meta['scraped']
            scraped[var_name], href = self.get_text_href(tr.xpath('td')[1])
            if nextparse:
                url = response.urljoin(href)
                yield scrapy.Request(url, callback=nextparse, meta={'scraped': scraped})
            else:
                item = AdmincodeItem()
                item['year'] = scraped['year']
                item['prov_name'] = scraped['prov_name']
                item['city_name'] = scraped['city_name']
                item['county_name'] = scraped['county_name']
                item['town_name'] = scraped['town_name']
                item['gbcode'], href = self.get_text_href(tr.xpath('td')[0])
                yield item

    def parse_citytr(self, response):
        for city in self.parse_2td(response, response.selector.css(".citytr"), 'city_name', self.parse_countytr):
            yield city

    def parse_countytr(self, response):
        for county in self.parse_2td(response, response.selector.css(".countytr"), 'county_name', self.parse_towntr):
            yield county

    def parse_towntr(self, response):
        for town in self.parse_2td(response, response.selector.css(".towntr"), 'town_name', None):
            yield town
I think you just made things a bit more complex than necessary. This is a simple scraper; what you need to do is pass information from one page to the next using meta. Since meta is a dictionary held in memory, we need to make sure we create copies of the information for the items to come. For that we use copy.deepcopy, which ensures the data is not overwritten before the items are yielded.
Below is a scraper which does that:
from copy import deepcopy
from scrapy.spiders import Spider

class StatsSpider(Spider):
    name = 'stats'
    allowed_domains = ['stats.gov.cn']
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{}/index.html'.format(year) for year in range(2009, 2010)]

    def parse(self, response):
        for item in response.css(".provincetr a"):
            name = item.xpath("./text()").extract_first().strip()
            link = item.xpath("./@href").extract_first().strip()
            yield response.follow(link, callback=self.parse_province, meta={'item': {'province': name}})

    def parse_province(self, response):
        meta = response.meta['item']
        for cityrow in response.css(".citytr"):
            city_link = cityrow.xpath("./td[2]/a/@href").extract_first()
            city_name = cityrow.xpath("./td[2]/a/text()").extract_first()
            city_code = cityrow.xpath("./td[1]/a/text()").extract_first()
            meta_new = deepcopy(meta)
            meta_new['city_name'] = city_name
            meta_new['city_code'] = city_code
            yield response.follow(city_link, callback=self.parse_city, meta={'item': meta_new})

    def parse_city(self, response):
        meta = response.meta['item']
        for countyrow in response.css(".countytr"):
            county_link = countyrow.xpath("./td[2]/a/@href").extract_first()
            county_name = countyrow.xpath("./td[2]/a/text()").extract_first()
            county_code = countyrow.xpath("./td[1]/a/text()").extract_first()
            meta_new = deepcopy(meta)
            meta_new['county_name'] = county_name
            meta_new['county_code'] = county_code
            yield response.follow(county_link, callback=self.parse_county, meta={"item": meta_new})

    def parse_county(self, response):
        meta = response.meta['item']
        for townrow in response.css(".towntr"):
            town_link = townrow.xpath("./td[2]/a/@href").extract_first()
            town_name = townrow.xpath("./td[2]/a/text()").extract_first()
            town_code = townrow.xpath("./td[1]/a/text()").extract_first()
            meta_new = deepcopy(meta)
            meta_new['town_name'] = town_name
            meta_new['town_code'] = town_code
            yield meta_new
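To illustrate why the deepcopy matters (a minimal sketch, independent of Scrapy): without it, all pending requests would share the same dictionary object, so whichever row was processed last would overwrite the values seen by every earlier request, which is exactly the "always the last city/county" symptom:

meta = {'province': '北京市'}

shared = [meta for _ in range(3)]            # the same dict object reused three times
for i, m in enumerate(shared):
    m['city_name'] = 'city-{}'.format(i)     # each assignment overwrites the previous one
print([m['city_name'] for m in shared])      # ['city-2', 'city-2', 'city-2']

from copy import deepcopy
copied = [deepcopy(meta) for _ in range(3)]  # independent copies
for i, m in enumerate(copied):
    m['city_name'] = 'city-{}'.format(i)
print([m['city_name'] for m in copied])      # ['city-0', 'city-1', 'city-2']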
Still getting to grips with Scrapy and have been following this tutorial. I'm having a little trouble, however, as I am getting NameError: name 'DmozItem' is not defined when I run this:
import scrapy
from scrapy import Item, Field

class QuotesItems(scrapy.Item):
    area_name = scrapy.Field()
    room_type = scrapy.Field()
    period = scrapy.Field()
    duration_weekly = scrapy.Field()
    guide_total = scrapy.Field()
    amenities = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "not_quotes"
    start_urls = [
        'http://www.unitestudents.com/',
    ]

    # Step 1
    def parse(self, response):
        # Select all cities listed in the select (exclude the "Select your city" option)
        for city in response.xpath('//select[@id="frm_homeSelect_city"]/option[not(contains(text(),"Select your city"))]/text()').extract():
            yield scrapy.Request(response.urljoin("/" + city), callback=self.parse_citypage)

    # Step 2
    def parse_citypage(self, response):
        # Select the url for each property
        for url in response.xpath('//div[@class="property-header"]/h3/span/a/@href').extract():
            yield scrapy.Request(response.urljoin(url), callback=self.parse_unitpage)

    # Step 3
    def parse_unitpage(self, response):
        # Select the final page for the data scrape
        for final in response.xpath('//div/div/div[@class="content__btn"]/a/@href').extract():
            yield scrapy.Request(response.urljoin(final), callback=self.parse_final)

    # Step 4
    def parse(self, response):
        for sel in response.xpath('//html/body/div'):
            item = DmozItem()
            item['area_name'] = sel.xpath('//div/ul/li/a/span/text()').extract()
            item['room_type'] = sel.xpath('//div/div/div/h1/span/text()').extract()
            item['period'] = sel.xpath('/html/body/div/div/section/div/form/h4/span/text()').extract()
            item['duration_weekly'] = sel.xpath('//html/body/div/div/section/div/form/div/div/em/text()').extract()
            item['guide_total'] = sel.xpath('//html/body/div/div/section/div/form/div/div/p/text()').extract()
            item['amenities'] = sel.xpath('//div/div/div/ul/li/p/text()').extract()
            yield item
I have set up my items.py file as:
class DmozItem(Item):
    area_name = Field()
    room_type = Field()
    period = Field()
    duration_weekly = Field()
    guide_total = Field()
    amenities = Field()
    pass
Not really sure where I am going wrong on this one?
You should import DmozItem
from YourFolderName.items import DmozItem
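For reference, in a default Scrapy project layout the spider module sits inside the project package next to items.py, so either an absolute or a relative import works; a minimal sketch (YourFolderName is whatever scrapy startproject created for you, and the file name is just an example):

# YourFolderName/spiders/your_spider.py
from YourFolderName.items import DmozItem    # absolute import, or equivalently:
# from ..items import DmozItem               # relative import from the spiders package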
I'm attempting to scrape a website to get a very rough demographic of its users (no personally identifying information or photos), but the tutorial spider from the official documentation that I've modified is repeating the same line of output 4 times in a row.
A copy of the code I'm using is below:
Note that the example profile I've included in the code is a fake/spam account. If it has already been deleted, you can replace the URL with any other profile on the site and it will work again.
import scrapy

class DateSpider(scrapy.Spider):
    name = "date"
    start_urls = [
        'http://www.pof.com/viewprofile.aspx?profile_id=141659067',
    ]

    def parse(self, response):
        for container in response.xpath('//div[@class="user-details-wide"]'):
            yield {
                'Gender': response.xpath("//span[@id='gender']/text()").extract_first(),
                'Age': response.xpath("//span[@id='age']/text()").extract_first(),
                'State': response.xpath("//span[@id='state_id']/text()").extract_first(),
                'Marital status': response.xpath("//span[@id='maritalstatus']/text()").extract_first(),
                'Body': response.xpath("//span[@id='body']/text()").extract_first(),
                'Height': response.xpath("//span[@id='height']/text()").extract_first(),
                'Ethnicity': response.xpath("//span[@id='ethnicity']/text()").extract_first(),
                'Does drugs?': response.xpath("//span[@id='drugs']/text()").extract_first(),
                'Smokes?': response.xpath("//span[@id='smoke']/text()").extract_first(),
                'Drinks?': response.xpath("//span[@id='drink']/text()").extract_first(),
                'Has children?': response.xpath("//span[@id='haschildren']/text()").extract_first(),
                'Wants children?': response.xpath("//span[@id='wantchildren']/text()").extract_first(),
                'Star sign': response.xpath("//span[@id='zodiac']/text()").extract_first(),
                'Education': response.xpath("//span[@id='college_id']/text()").extract_first(),
                'Personality': response.xpath("//span[@id='fishtype']/text()").extract_first(),
            }
Running as follows:
scrapy crawl date -o date.csv
The output I'm looking for is one row of headers followed by one line of results straight after it, not the whitespace and duplicates I'm currently getting.
You don't need to use a for loop: each field appears once on the profile page, so you can simply select each span element and extract its data directly.
Also, I suggest you use Scrapy items; it's more convenient.
One way to clean extracted data of whitespace is to use the XPath function normalize-space().
import scrapy
from items import DateSpiderItem

class DateSpider(scrapy.Spider):
    name = "date"
    start_urls = [
        'http://www.pof.com/viewprofile.aspx?profile_id=141659067',
    ]

    def parse(self, response):
        item = DateSpiderItem()
        item['Gender'] = response.xpath(
            "//span[@id='gender']/text()").extract_first()
        item['Age'] = response.xpath(
            "//span[@id='age']/text()").extract_first()
        item['State'] = response.xpath(
            "//span[@id='state_id']/text()").extract_first()
        item['Marital_status'] = response.xpath(
            "normalize-space(//span[@id='maritalstatus']/text())").extract_first()
        item['Body'] = response.xpath(
            "//span[@id='body']/text()").extract_first()
        item['Height'] = response.xpath(
            "//span[@id='height']/text()").extract_first()
        item['Ethnicity'] = response.xpath(
            "//span[@id='ethnicity']/text()").extract_first()
        item['Does_drugs'] = response.xpath(
            "normalize-space(//span[@id='drugs']/text())").extract_first()
        item['Smokes'] = response.xpath(
            "//span[@id='smoke']/text()").extract_first()
        item['Drinks'] = response.xpath(
            "normalize-space(//span[@id='drink']/text())").extract_first()
        item['Has_children'] = response.xpath(
            "normalize-space(//span[@id='haschildren']/text())").extract_first()
        item['Wants_children'] = response.xpath(
            "normalize-space(//span[@id='wantchildren']/text())").extract_first()
        item['Star_sign'] = response.xpath(
            "//span[@id='zodiac']/text()").extract_first()
        yield item
Items file:
class DateSpiderItem(scrapy.Item):
    Gender = scrapy.Field()
    Age = scrapy.Field()
    State = scrapy.Field()
    Marital_status = scrapy.Field()
    Body = scrapy.Field()
    Height = scrapy.Field()
    Ethnicity = scrapy.Field()
    Does_drugs = scrapy.Field()
    Smokes = scrapy.Field()
    Drinks = scrapy.Field()
    Has_children = scrapy.Field()
    Wants_children = scrapy.Field()
    Star_sign = scrapy.Field()
    Education = scrapy.Field()
    Personality = scrapy.Field()
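With the item defined, running the same command as before (scrapy crawl date -o date.csv) should now produce one header row followed by a single row of values per profile, with the whitespace stripped wherever normalize-space() was applied.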