I'm trying to crawl the website of a prominent UK retailer and get an AttributeError as follows:
nl_env/lib/python3.6/site-packages/scrapy/spiders/sitemap.py", line 52, in _parse_sitemap
for r, c in self._cbs:
AttributeError: 'NlSMCrawlerSpider' object has no attribute '_cbs'
It's probably me not fully understanding how a SitemapSpider operates; see my code below:
class NlSMCrawlerSpider(SitemapSpider):
    name = 'nl_smcrawler'
    allowed_domains = ['newlook.com']
    sitemap_urls = ['http://www.newlook.com/uk/sitemap/maps/sitemap_uk_product_en_1.xml']
    sitemap_follow = ['/uk/womens/clothing/']

    # sitemap_rules = [
    #     ('/uk/womens/clothing/', 'parse_product'),
    # ]

    def __init__(self):
        self.driver = webdriver.Safari()
        self.driver.set_window_size(800, 600)
        time.sleep(2)

    def parse_product(self, response):
        driver = self.driver
        driver.get(response.url)
        time.sleep(1)

        # Collect products
        itemDetails = driver.find_elements_by_class_name('product-details-page content')

        # Pull features
        desc = itemDetails[0].find_element_by_class_name('product-description__name').text
        href = driver.current_url

        # Generate a product identifier
        identifier = href.split('/p/')[1].split('?comp')[0]
        identifier = int(identifier)

        # datetime
        dt = date.today()
        dt = dt.isoformat()

        # Price symbol removal and float conversion
        try:
            priceString = itemDetails[0].find_element_by_class_name('price product-description__price').text
        except:
            priceString = itemDetails[0].find_element_by_class_name('price--previous-price product-description__price--previous-price ng-scope').text
        priceInt = priceString.split('£')[1]
        originalPrice = float(priceInt)

        # discountedPrice logic
        try:
            discountedPriceString = itemDetails[0].find_element_by_class_name('price price--marked-down product-description__price').text
            discountedPriceInt = discountedPriceString.split('£')[1]
            discountedPrice = float(discountedPriceInt)
        except:
            discountedPrice = 'N/A'

        # NlScrapeItem
        item = NlScrapeItem()

        # Append product to NlScrapeItem
        item['identifier'] = identifier
        item['href'] = href
        item['description'] = desc
        item['originalPrice'] = originalPrice
        item['discountedPrice'] = discountedPrice
        item['firstSighted'] = dt
        item['lastSighted'] = dt
        yield item
Also, don't hesitate to ask for any further details. See the link to the sitemap and a link to the actual file within the Scrapy package that is throwing the error (link - github). Your help would be sincerely appreciated.
Edit: One Thought
Looking at the 2nd link (from the Scrapy package), I can see _cbs is initialised in the def __init__(self, *a, **kw): function. Is the fact that I have my own __init__ logic throwing it off?
There are two issues in your scraper. One is the __init__ method:
def __init__(self):
    self.driver = webdriver.Safari()
    self.driver.set_window_size(800, 600)
    time.sleep(2)
You have defined a new __init__ that overrides the base class __init__. Since your __init__ never calls the base one, _cbs is never initialized. You can easily fix this by changing your __init__ method as below:
def __init__(self, *a, **kw):
    super(NlSMCrawlerSpider, self).__init__(*a, **kw)
    self.driver = webdriver.Safari()
    self.driver.set_window_size(800, 600)
    time.sleep(2)
Next, the SitemapSpider will always send responses to the parse method, and you have not defined a parse method at all. So I added a simple one that just prints the URLs:
def parse(self, response):
    print(response.url)
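For completeness: if you want matching product URLs routed to parse_product instead of the default parse, you can re-enable the sitemap_rules you had commented out. A minimal sketch, using only attributes that already appear in the question:

class NlSMCrawlerSpider(SitemapSpider):
    name = 'nl_smcrawler'
    allowed_domains = ['newlook.com']
    sitemap_urls = ['http://www.newlook.com/uk/sitemap/maps/sitemap_uk_product_en_1.xml']
    sitemap_follow = ['/uk/womens/clothing/']
    # Sitemap URLs matching this pattern are sent to parse_product;
    # URLs that match no rule are skipped.
    sitemap_rules = [
        ('/uk/womens/clothing/', 'parse_product'),
    ]

    def __init__(self, *a, **kw):
        # Calling the base __init__ is what initialises _cbs
        super(NlSMCrawlerSpider, self).__init__(*a, **kw)
        self.driver = webdriver.Safari()
        self.driver.set_window_size(800, 600)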
Related
I am trying to parse a public forum that contains multiple threads, and I need to store metadata about each thread. This metadata appears before entering the thread, i.e. on the page that displays the list of discussion threads.
In my scrapy code below, I need to access values from the parse() method inside the parse_contents() method. I am storing those values in class variables, but parse_contents() only ever picks up the value that was assigned the very first time, even though a new value has been assigned before each call to parse_contents().
Here is my spider class
import scrapy
import re
import pandas as pd
import time
from functools import reduce
from ..items import PostsItem

class SpiderSpider(scrapy.Spider):
    name = 'posts'
    page_count = 1
    forum_count = 0

    # Create an item container to store all this data
    post_item = PostsItem()

    # I want these variables to reach the parse_contents() method
    post_subject_last_message_date = ""
    total_posts = 0

    start_urls = [
        # 'https://www.dcurbanmom.com/jforum/posts/list/150/946237.page'
        'https://www.dcurbanmom.com/jforum/forums/show/32.page'
    ]

    # Grabs the list of threads in the DCPS forum
    def parse(self, response):
        for next_forum in response.xpath('//span[@class="topictitle"]'):
            next_forum_link = next_forum.xpath('.//a/@href')
            next_forum_url = response.urljoin(next_forum_link.extract_first())
            last_message = next_forum.xpath('.//ancestor::td[1]/following-sibling::td[4]/span/text()')
            self.post_subject_last_message_date = last_message.get()  # This needs to be picked up by parse_contents
            yield scrapy.Request(url=next_forum_url, callback=self.parse_contents)
        # Get next page of discussion threads list
        # Some code here

    # Parses an individual discussion thread
    def parse_contents(self, response):
        all_posts = response.xpath('//table[@class="forumline"]//tr')
        post_text = ""
        for post in all_posts:
            post_text_response = post.xpath(".//div[@class='postbody']/br/following-sibling::text()[1] | .//div[@class='postbody']/br/following-sibling::a[1]/text() | .//div[@class='postbody']/text() | .//div[@class='postbody']/a/text()")
            if(len(post_text_response.getall())>0):
                post_text = "".join(re.sub('\r','',x) for x in post_text_response.getall()).strip()

            # Populate the item container
            if(bool(re.search(r'^\s*$', post_text))==False):
                self.post_item['post_message'] = post_text
                # !!! This is not picking up the value updated in the parse method !!!
                self.post_item['post_subject_last_message_date'] = self.post_subject_last_message_date
                post_text = ""
                yield(self.post_item)

        # Go to next page in this discussion thread
        # Some code here
How can I fix this?
Edit: removed some lines of code to make it easier to read
Replacing yield scrapy.Request(url = next_forum_url, callback=self.parse_contents) with the following fixed it for me:
yield scrapy.Request(url=next_forum_url, callback=self.parse_contents, cb_kwargs={
    'post_subject_answers': post_subject_answer,
    'post_subject_first_post_date': post_subject_first_post_date,
    'post_subject_views': post_subject_views,
    'post_subject_last_message_date': post_subject_last_message_date
})
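For this to work, parse_contents also has to accept the keyword arguments passed via cb_kwargs. A minimal sketch, showing only the fields from the Request above and leaving the extraction logic unchanged:

def parse_contents(self, response, post_subject_answers=None,
                   post_subject_first_post_date=None,
                   post_subject_views=None,
                   post_subject_last_message_date=None):
    # The values captured in parse() for this specific thread arrive here,
    # instead of being read from a shared class attribute.
    item = PostsItem()
    item['post_subject_last_message_date'] = post_subject_last_message_date
    # ... extract post_message exactly as before ...
    yield item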
So I'm making a couple of scrapers, and now I'm trying to make a script that runs the corresponding spiders with URLs collected from a DB, but I can't find a way to do this.
I have this in my spider:
class ElCorteIngles(scrapy.Spider):
    name = 'ElCorteIngles'
    url = ''
    DEBUG = False

    def start_requests(self):
        if self.url != '':
            yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        # Get product name
        try:
            self.p_name = response.xpath('//*[@id="product-info"]/h2[1]/a/text()').get()
        except:
            print(f'{CERROR} Problem while getting product name from website - {self.name}')

        # Get product price
        try:
            self.price_no_cent = response.xpath('//*[@id="price-container"]/div/span[2]/text()').get()
            self.cent = response.xpath('//*[@id="price-container"]/div/span[2]/span[1]/text()').get()
            self.currency = response.xpath('//*[@id="price-container"]/div/span[2]/span[2]/text()').get()
            if self.currency == None:
                self.currency = response.xpath('//*[@id="price-container"]/div/span[2]/span[1]/text()').get()
                self.cent = None
        except:
            print(f'{CERROR} Problem while getting product price from website - {self.name}')

        # Join self.price_no_cent with self.cent
        try:
            if self.cent != None:
                self.price = str(self.price_no_cent) + str(self.cent)
                self.price = self.price.replace(',', '.')
            else:
                self.price = self.price_no_cent
        except:
            print(f'{ERROR} Problem while joining price with cents - {self.name}')

        # Return data
        if self.DEBUG == True:
            print([self.p_name, self.price, self.currency])

        data_collected = ShopScrapersItems()
        data_collected['url'] = response.url
        data_collected['p_name'] = self.p_name
        data_collected['price'] = self.price
        data_collected['currency'] = self.currency
        yield data_collected
Normally when I run the spider from the console I do:
scrapy crawl ElCorteIngles -a url='https://www.elcorteingles.pt/electrodomesticos/A26601428-depiladora-braun-senso-smart-5-5500/'
and now I need a way to do the same from an external script and get the output of yield data_collected.
What I currently have in my external script is this:
import scrapy
from scrapy.crawler import CrawlerProcess
import sqlalchemy as db

# Import internal libraries
from Ruby.Ruby.spiders import *

# Variables
engine = db.create_engine('mysql+pymysql://DATABASE_INFO')

class Worker(object):

    def __init__(self):
        self.crawler = CrawlerProcess({})

    def scrape_new_links(self):
        conn = engine.connect()
        # Get all new links from DB and scrape them
        query = 'SELECT * FROM Ruby.New_links'
        result = conn.execute(query)
        for x in result:
            telegram_id = x[1]
            email = x[2]
            phone_number = x[3]
            url = x[4]
            spider = x[5]
            # In this case the spider will be ElCorteIngles and the url
            # https://www.elcorteingles.pt/electrodomesticos/A26601428-depiladora-braun-senso-smart-5-5500/
            self.crawler.crawl(spider, url=url)
        self.crawler.start()

Worker().scrape_new_links()
I also don't know if doing url=url in self.crawler.crawl() is the proper way to give the URL to the spider, but let me know what you think.
All data from yield is being returned by a pipeline.
I think there is no need for extra info but if you need any just let me know!
Scrapy works asynchronously. Ignore my imports, but this is a JSON API I made for Scrapy. You need to make a custom runner with an item_scraped signal. There was originally a klein endpoint, and when the spider finished it would return a JSON list. I think this is what you want, but without the klein endpoint, so I've taken it out. My spider was GshopSpider; I replaced it with your spider's name.
By taking advantage of a deferred we are able to use callbacks and send signals each time an item is scraped. Using this code we collect each item into a list via the signal, and when the spider finishes we have a callback set up to return_spider_output.
# server.py
import json
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
# import your spider here (the original used: from googleshop.spiders.gshop import GshopSpider)

class MyCrawlerRunner(CrawlerRunner):
    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        # keep all items scraped
        self.items = []
        crawler = self.create_crawler(crawler_or_spidercls)
        crawler.signals.connect(self.item_scraped, signals.item_scraped)
        dfd = self._crawl(crawler, *args, **kwargs)
        dfd.addCallback(self.return_items)
        return dfd

    def item_scraped(self, item, response, spider):
        self.items.append(item)

    def return_items(self, result):
        return self.items

def return_spider_output(output):
    return json.dumps([dict(item) for item in output])

if __name__ == "__main__":
    settings = get_project_settings()
    runner = MyCrawlerRunner(settings)
    deferred = runner.crawl(ElCorteIngles)   # pass the spider class, not an instance
    deferred.addCallback(return_spider_output)
    # return deferred  # this was the return value of the original klein endpoint
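With the klein endpoint removed, nothing actually drives that deferred. A minimal standalone sketch, assuming the MyCrawlerRunner and return_spider_output defined above and that ElCorteIngles is importable from your project, would run the Twisted reactor until the crawl finishes, so the __main__ block could instead look like this:

from twisted.internet import reactor

if __name__ == "__main__":
    settings = get_project_settings()
    runner = MyCrawlerRunner(settings)
    deferred = runner.crawl(ElCorteIngles)
    deferred.addCallback(return_spider_output)
    deferred.addCallback(print)                 # e.g. dump the JSON string of items
    deferred.addBoth(lambda _: reactor.stop())  # stop the reactor on success or failure
    reactor.run()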
The easiest way to do this would be something like this:
class ElCorteIngles(scrapy.Spider):
    name = 'ElCorteIngles'
    url = ''
    DEBUG = False

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Establish your db connection here. This can be any database connection.
        # Reuse this connection object anywhere else.
        self.conn = engine.connect()

    def start_requests(self):
        with self.conn.cursor() as cursor:
            cursor.execute('''SELECT * FROM Ruby.New_links WHERE url IS NOT NULL AND url != %s''', ('',))
            result = cursor.fetchall()
        for url in result:
            yield scrapy.Request(url=url, dont_filter=True, callback=self.parse)

    def parse(self, response):
        # Your parse code here
        pass
After doing this, you can start the crawler using something like this:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from project_name.spiders.filename import ElCorteIngles
process = CrawlerProcess(get_project_settings())
process.crawl(ElCorteIngles)
process.start()
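If you also need to hand the URL to the spider per crawl, the same way -a url=... does on the command line, the keyword arguments of crawl() are forwarded to the spider and end up as attributes. A sketch, where url is whatever you read from your Ruby.New_links table:

# Equivalent of: scrapy crawl ElCorteIngles -a url='https://...'
process = CrawlerProcess(get_project_settings())
process.crawl(ElCorteIngles, url=url)  # url becomes self.url inside the spider
process.start()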
Hope this helps.
I would also recommend using a queue if you are working with a large number of URLs, as shown in the sketch below. This will enable multiple spider processes to work on these URLs in parallel. You can initialise the queue in the __init__ method.
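A minimal sketch of that idea with Python's standard library queue (single process; for multiple spider processes you would swap this for an external queue such as Redis). The fill_from_db helper is hypothetical and stands in for your own database query:

import queue
import scrapy

class ElCorteIngles(scrapy.Spider):
    name = 'ElCorteIngles'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_queue = queue.Queue()
        # Hypothetical helper: yields every URL from Ruby.New_links
        for url in fill_from_db():
            self.url_queue.put(url)

    def start_requests(self):
        # Drain the queue; each URL becomes one request
        while not self.url_queue.empty():
            yield scrapy.Request(self.url_queue.get(), callback=self.parse, dont_filter=True)

    def parse(self, response):
        # Your existing parse logic goes here
        pass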
The problem is that I've been iterating over a list of places to scrape the latitude, longitude and elevation. The thing is, when I get the scraped data back I have no way to link it to my current df, since the names I iterated over may have been either modified or skipped.
I've managed to get the name of what I searched for, but since it is parsed outside the link that yields the rest of the items, it doesn't match up properly.
import scrapy
import pandas as pd
from ..items import latlonglocItem

df = pd.read_csv('wine_df_final.csv')
df = df[pd.notnull(df.real_place)]
real_place = list(set(df.real_place))

class latlonglocSpider(scrapy.Spider):
    name = 'latlonglocs'
    start_urls = []
    for place in real_place:
        baseurl = place.replace(',', '').replace(' ', '+')
        cleaned_href = f'http://www.google.com/search?q={baseurl}+coordinates+latitude+longitude+distancesto'
        start_urls.append(cleaned_href)

    def parse(self, response):
        items = latlonglocItem()
        items['base_name'] = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]
        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if href.startswith('/url?q=https://www.distancesto'):
                yield response.follow(href, self.parse_distancesto)
            else:
                pass
        yield items

    def parse_distancesto(self, response):
        items = latlonglocItem()
        try:
            items['appellation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)').get()
            items['latitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)').get()
            items['longitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)').get()
            items['elevation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)').get()
            yield items
        except Exception:
            pass
#output
appellation base_name elevation latitude longitude
Chalone, USA
Santa Cruz, USA 56.81 35 9.23
What is happening is that I parse what I searched for, then the spider goes inside a link and parses the rest of the information. However, in my dataframe the name of what I searched for ends up completely unattached from the rest of the items, and even then it is hard to find the match. I wish to pass the info to the other function so it yields all the items together.
This may work. I will comment on both what I am doing and a little bit of your code, so you have an understanding of what I am doing.
import scrapy
import pandas as pd
from ..items import latlonglocItem

df = pd.read_csv('wine_df_final.csv')
df = df[pd.notnull(df.real_place)]
real_place = list(set(df.real_place))

class latlonglocSpider(scrapy.Spider):  # latlonglocSpider is a child class of scrapy.Spider
    name = 'latlonglocs'
    start_urls = []
    for place in real_place:
        baseurl = place.replace(',', '').replace(' ', '+')
        cleaned_href = f'http://www.google.com/search?q={baseurl}+coordinates+latitude+longitude+distancesto'
        start_urls.append(cleaned_href)

    def __init__(self):  # Constructor for our class
        # Since we wrote our own constructor we need to call the parent's constructor
        scrapy.Spider.__init__(self)
        self.base_name = None  # Here is the base_name we can now use class-wide

    def parse(self, response):
        items = latlonglocItem()
        items['base_name'] = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]
        self.base_name = items['base_name']  # Let's store the base_name in the class
        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if href.startswith('/url?q=https://www.distancesto'):
                yield response.follow(href, self.parse_distancesto)
            else:
                pass
        yield items

    def parse_distancesto(self, response):
        items = latlonglocItem()
        try:
            # If for some reason self.base_name is never assigned in parse()
            # then we want to use an empty string instead.
            # The following syntax means: use self.base_name unless it is None
            # or empty, in which case just use an empty string.
            base_name = self.base_name or ""
            items['appellation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)').get()
            items['latitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)').get()
            items['longitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)').get()
            items['elevation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)').get()
            yield items
        except Exception:
            pass
Updated version, with base_name set inside the loop in parse() and attached to the item in parse_distancesto():

import scrapy
import pandas as pd
from ..items import latlonglocItem

df = pd.read_csv('wine_df_final.csv')
df = df[pd.notnull(df.real_place)]
real_place = list(set(df.real_place))

class latlonglocSpider(scrapy.Spider):  # latlonglocSpider is a child class of scrapy.Spider
    name = 'latlonglocs'
    start_urls = []
    for place in real_place:
        baseurl = place.replace(',', '').replace(' ', '+')
        cleaned_href = f'http://www.google.com/search?q={baseurl}+coordinates+latitude+longitude+distancesto'
        start_urls.append(cleaned_href)

    def __init__(self):  # Constructor for our class
        # Since we wrote our own constructor we need to call the parent's constructor
        scrapy.Spider.__init__(self)
        self.base_name = None  # Here is the base_name we can now use class-wide

    def parse(self, response):
        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if href.startswith('/url?q=https://www.distancesto'):
                self.base_name = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]
                yield response.follow(href, self.parse_distancesto)
            else:
                pass

    def parse_distancesto(self, response):
        items = latlonglocItem()
        try:
            # If for some reason self.base_name is never assigned in parse()
            # then we want to use an empty string instead.
            items['base_name'] = self.base_name or ""
            items['appellation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)').get()
            items['latitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)').get()
            items['longitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)').get()
            items['elevation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)').get()
            yield items
        except Exception:
            pass
Thanks to Error - Syntactical Remorse. CONCURRENT_REQUESTS had to be set to 1 for it to work, and base_name was placed inside the loop.
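For reference, a minimal way to pin this spider to one request at a time is custom_settings (a sketch; the rest of the spider stays as above):

class latlonglocSpider(scrapy.Spider):
    name = 'latlonglocs'
    # Process one request at a time so the self.base_name set in parse()
    # still refers to the right place when parse_distancesto() runs.
    custom_settings = {'CONCURRENT_REQUESTS': 1}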
After writing some code in Python, I've got stuck in deep trouble. I'm a newbie at writing code following OOP design in Python. The xpaths I've used in my code are flawless. I'm getting lost when it comes to running the "passing_links" method of my "Info_grabber" class through the instance of the "page_crawler" class. Every time I run my code I get the error "'page_crawler' object has no attribute 'passing_links'". Perhaps the way I've written my class crawler is not how it should be. However, as I've spent a few hours on it, I hope I might get some suggestions as to which lines I should rectify to make it work. Thanks in advance for taking a look:
from lxml import html
import requests

class page_crawler(object):
    main_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
    base_link = "https://www.yellowpages.com"

    def __init__(self):
        self.links = [self.main_link]

    def crawler(self):
        for link in self.links:
            self.get_link(link)

    def get_link(self, link):
        print("Running page " + link)
        page = requests.get(link)
        tree = html.fromstring(page.text)
        item_links = tree.xpath('//h2[@class="n"]/a[@class="business-name"][not(@itemprop="name")]/@href')
        for item_link in item_links:
            return self.base_link + item_link
        links = tree.xpath('//div[@class="pagination"]//li/a/@href')
        for url in links:
            if not self.base_link + url in self.links:
                self.links += [self.base_link + url]

class Info_grabber(page_crawler):
    def __init__(self, plinks):
        page_crawler.__init__(self)
        self.plinks = [plinks]

    def passing_links(self):
        for nlink in self.plinks:
            print(nlink)
            self.crawling_deep(nlink)

    def crawling_deep(self, uurl):
        page = requests.get(uurl)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        try:
            email = tree.xpath('//div[@class="business-card-footer"]/a[@class="email-business"]/@href')[0]
        except IndexError:
            email = ""
        print(name, phone, email)

if __name__ == '__main__':
    crawl = Info_grabber(page_crawler)
    crawl.crawler()
    crawl.passing_links()
Now upon execution I get a new error, "raise MissingSchema(error)", when it hits the line "self.crawling_deep(nlink)".
I'm not sure I understand what you're trying to do in page_crawler.get_link, but I think you should have a different method for collecting the "pagination" links.
I renamed Info_grabber.plinks to Info_grabber.links so that page_crawler.crawler can access them, and managed to extract info from several pages; however, the code is far from ideal.
from lxml import html
import requests

class page_crawler(object):
    main_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
    base_link = "https://www.yellowpages.com"

    def __init__(self):
        self.links = []
        self.pages = []

    def crawler(self):
        for link in self.links:
            self.get_link(link)

    def get_link(self, link):
        print("Running page " + link)
        page = requests.get(link)
        tree = html.fromstring(page.text)
        item_links = tree.xpath('//h2[@class="n"]/a[@class="business-name"][not(@itemprop="name")]/@href')
        for item_link in item_links:
            if not self.base_link + item_link in self.links:
                self.links += [self.base_link + item_link]

    def get_pages(self, link):
        page = requests.get(link)
        tree = html.fromstring(page.text)
        links = tree.xpath('//div[@class="pagination"]//li/a/@href')
        for url in links:
            if not self.base_link + url in self.pages:
                self.pages += [self.base_link + url]

class Info_grabber(page_crawler):
    def __init__(self, plinks):
        page_crawler.__init__(self)
        self.links += [plinks]

    def passing_links(self):
        for nlink in self.links:
            print(nlink)
            self.crawling_deep(nlink)

    def crawling_deep(self, uurl):
        page = requests.get(uurl)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        try:
            email = tree.xpath('//div[@class="business-card-footer"]/a[@class="email-business"]/@href')[0]
        except IndexError:
            email = ""
        print(name, phone, email)

if __name__ == '__main__':
    url = page_crawler.main_link
    crawl = Info_grabber(url)
    crawl.crawler()
    crawl.passing_links()
You'll notice that I added a pages property and a get_pages method in page_crawler; I'll leave the wiring-up part to you (one possible way is sketched below).
You might need to add more methods to page_crawler later on, as they could be of use if you develop more child classes. Finally, consider looking into composition, as it is also a strong OOP feature.
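One possible way to wire get_pages in (a sketch under the classes above, not the only design): have crawler() collect the pagination links first and then run get_link over the main page and every page it found:

def crawler(self):
    # Collect pagination links from the main page first,
    # then pull business links from the main page and each paginated page.
    self.get_pages(self.main_link)
    for page_url in [self.main_link] + self.pages:
        self.get_link(page_url)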
Your crawl is an instance of the page_crawler class, but not of the Info_grabber class, which is the class that has the passing_links method. I think what you want is to make crawl an instance of Info_grabber instead.
Then, I believe, before doing self.crawling_deep you must do:
if n_link:
    page = requests.get(n_link).text
    tel = re.findall(r'\d{10}', page)[0] if re.findall(r'\d{10}', page) else ""
    print(tel)
I have a main page from which I crawl a name and a URL. I then need to go to that URL and crawl further details such as full name, age, and link. Finally, I need to return the items with (name, url, age, sex, link) as a single item.
I want to define the first level of the crawl in one method, crawl_page, and the second level in another method, crawl_item.
class CrawlLink(CrawlSpider):
    name = "crawllink"
    allowed_domains = ['www.xyz.org']
    start_urls = ["www.xyz.org/profile?page=0"]
    rules = [Rule(SgmlLinkExtractor(allow=('/profile\?page=\d+'), restrict_xpaths=('//li[@class="pager-next"]',), canonicalize=False),
                  callback='parse_page',
                  follow=True)
             ]

    def parse_page(self, response):
        self.log('Started Crawling List %s' % response.url)
        items = response.xpath("//div[@id='profile']/div")
        ulists = []
        for temp in items:
            usritem = PostUsers()
            usrlink = temp.xpath("./div[@class='name']/a/@href").extract()[0]
            usritem["url"] = 'www.xyz.org' + usrlink
            usritem["namel"] = temp.xpath("//div[@id='user_profile_main']/dl/dd[1]/text()").extract()
            for urltemp in usrlink:
                yield Request(url=usritem["url"], callback=self.parse_user)
            # ulists.append(usritem)
        return ulists

    def parse_user(self, response):
        self.log('Started Crawling Profile %s' % response.url)
        usr = PostUsers()
        relative_url = response.xpath("//div[@id='nav-content']/ul/li[2]/a/@href").extract()[0]
        usr["link"] = 'www.xyz.org' + relative_url
        usr["age"] = response.xpath("//div[@id='user_user_full_group_profile_main']/dl/dd[1]/text()").extract()
        usr["fullname"] = response.xpath("//h1[@id='page-title']/text()").extract()
        self.log('Finished Crawling Profile %s' % response.url)
        return usr
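A common way to combine both levels into a single item is to pass the partially filled item along with the request via meta. A minimal sketch, keeping the PostUsers item and xpaths from the question; it yields one Request per profile and fills in the remaining fields in the second-level callback:

def parse_page(self, response):
    self.log('Started Crawling List %s' % response.url)
    for temp in response.xpath("//div[@id='profile']/div"):
        usritem = PostUsers()
        usrlink = temp.xpath("./div[@class='name']/a/@href").extract()[0]
        usritem["url"] = 'www.xyz.org' + usrlink
        usritem["namel"] = temp.xpath("//div[@id='user_profile_main']/dl/dd[1]/text()").extract()
        # Carry the partly filled item to the second-level callback
        yield Request(url=usritem["url"], callback=self.parse_user, meta={'item': usritem})

def parse_user(self, response):
    usr = response.meta['item']  # continue filling the same item
    usr["link"] = 'www.xyz.org' + response.xpath("//div[@id='nav-content']/ul/li[2]/a/@href").extract()[0]
    usr["age"] = response.xpath("//div[@id='user_user_full_group_profile_main']/dl/dd[1]/text()").extract()
    usr["fullname"] = response.xpath("//h1[@id='page-title']/text()").extract()
    return usr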