Errors while scraping emails from a website with Scrapy - Python

I am trying to code an e-mail scraper, but I'm having problems.
This is my code:
import scrapy
import re
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from email_validator import validate_email, EmailNotValidError
import requests
import pandas as pd

lista_star = ['vitalebarberiscanonico.it']

class MailSpider(scrapy.Spider):
    name = 'email'
    data = []

    def parse(self, response):
        links = LxmlLinkExtractor(allow=()).extract_links(response)
        links = [str(link.url) for link in links]
        links.append(str(response.url))
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_link)

    def parse_link(self, response):
        # self.reject is expected to be supplied when the spider is started
        # (e.g. process.crawl(MailSpider, ..., reject=[...]))
        for word in self.reject:
            if word in str(response.url):
                return
        html_text = str(response.text)
        mail_list = re.findall(r'\w+@\w+\.{1}\w+', html_text)
        for email in mail_list:
            self.data.append({'email': email, 'link': str(response.url)})

def get_info():
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    process.crawl(MailSpider, start_urls=lista_star)
    process.start()
    df = pd.DataFrame(MailSpider.data)
    df = df.drop_duplicates(subset='email')
    df = df.reset_index(drop=True)
    return df

df = get_info()
I get: ERROR: Error while obtaining start requests and ValueError: Missing scheme in request url: vitalebarberiscanonico.it
So I tried:
for link in links:
    parsed_url = urlparse(link)
    if not parsed_url.scheme:
        link = urlunparse(parsed_url._replace(scheme='http'))
    elif parsed_url.scheme not in ['http', 'https']:
        continue
    try:
        yield scrapy.Request(url=link, callback=self.parse_link)
    except:
        link = link.replace('http', 'https')
        yield scrapy.Request(url=link, callback=self.parse_link)
But it still does not work

The problem is that your original URL has no scheme. Instead of the URL-parsing code you tried, you can simply include the scheme (http or https) in the start URL itself:
lista_star = ['https://vitalebarberiscanonico.it/']
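If you would rather keep the bare domain in lista_star, another option (a minimal sketch, not the only way) is to override start_requests on the existing MailSpider so a scheme is prepended before any request is built:

class MailSpider(scrapy.Spider):
    name = 'email'
    data = []

    def start_requests(self):
        # Prepend a scheme to bare domains such as 'vitalebarberiscanonico.it'
        for url in self.start_urls:
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url
            yield scrapy.Request(url=url, callback=self.parse)

    # parse and parse_link stay exactly as in the question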

Related

Want to get the email but it provides the wrong output

I want to get the email address, but my code gives the wrong output. This is the page link: https://zoekeenadvocaat.advocatenorde.nl/advocaten/soesterberg/mevrouw-mr-mm-strengers/11094237420
import scrapy
from scrapy.http import Request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from scrapy_selenium import SeleniumRequest
import re

class TestSpider(scrapy.Spider):
    name = 'test'
    page_number = 1

    def start_requests(self):
        yield SeleniumRequest(
            url="https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina=1",
            wait_time=3,
            screenshot=True,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        books = response.xpath("//span[@class='h4 no-margin-bottom']//a//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        title = response.css(".title h3::text").get()
        advocaten = response.css(".secondary::text").get()
        detail = response.xpath("//section[@class='lawyer-info']")
        for i in range(len(detail)):
            if re.search("@", detail[i].get()):
                d1 = detail[i].xpath("//div[@class='column small-9']//a//@href").get()
                print(d1)
Change your XPath so that it selects the second element:
(//div[@class='column small-9'])[2]/a/@href
Example: http://xpather.com/Hhjolrh1
Alternative would be to select it directly:
//a[starts-with(@href, 'mailto')]/@href
Example: http://xpather.com/EtD8noeI
You get the phone number because it is the first element that fits 'column small-9'.
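For context, here is a rough sketch of how the mailto selector could slot into the parse_book callback from the question (the yielded dict is just for illustration; the original code only prints the value):

def parse_book(self, response):
    title = response.css(".title h3::text").get()
    advocaten = response.css(".secondary::text").get()
    # Select the mailto link directly instead of the first 'column small-9' div
    email = response.xpath("//a[starts-with(@href, 'mailto')]/@href").get()
    if email:
        email = email.replace('mailto:', '')
    yield {'title': title, 'advocaten': advocaten, 'email': email}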
As an alternative to the XPath answer, here is a solution without XPath:
soup.find("span", string="E-mail").parent.find_next("div").find("a").contents[0]

Scrapy only going through first 5 links with next_page_url

My code seemingly only goes through the first 5 links that are requested and then stops when the 6th is requested. I have tried to use start_urls and next_page_url. Both only extract from the first 5 pages given.
import scrapy
from scrapy.crawler import CrawlerProcess
import time

class finvizSpider(scrapy.Spider):
    global tickers
    global urlcheck
    urlcheck = 1
    tickers = []
    name = "finviz"
    start_urls = ["https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=change"]

    def parse(self, response):
        tickers.append(response.xpath('//a[@class="screener-link-primary"]/text()').extract())
        print(tickers)
        next_page_url = "https://finviz.com/"
        html = response.xpath(
            '//a[@class="screener_arrow"]/@href').extract()[0]
        print(html)
        next_page_url += html
        print(next_page_url)
        if next_page_url is not None:
            yield scrapy.Request(next_page_url, callback=self.parse)

    def returnTickers(self):
        newTickerList = []
        for lists in tickers:
            if lists:
                for t in lists:
                    newTickerList.append(t)
        return newTickerList
Any help is appreciated.
EDIT:
I have updated the code, but still seem to get errors.
import scrapy
from scrapy.crawler import CrawlerProcess
import time
from bs4 import BeautifulSoup

class finvizSpider(scrapy.Spider):
    global tickers
    global urlcheck
    urlcheck = 1
    tickers = []
    name = "finviz"
    start_urls = [
        "https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=-change"]

    def parse(self, url):
        raw_html = scrapy.Request(url)
        good_html = BeautifulSoup(raw_html, 'html.parser')
        first_part = "https://finviz.com/"
        tickers.append([x.text for x in good_html.findAll('a', {'class': 'screener-link-primary'})])
        second_part = good_html.find('a', {'class': 'screener_arrow'})['href']
        # Check if there is next page
        if second_part:
            next_url = first_part + second_part
            self.parse(next_url)

    def returnTickers(self):
        newTickerList = []
        for lists in tickers:
            if lists:
                for t in lists:
                    newTickerList.append(t)
        return newTickerList

stock_list = finvizSpider()
process = CrawlerProcess()
process.crawl(finvizSpider)
process.start()
list2 = stock_list.returnTickers()
I get an error when this is run.
The check if next_page_url is not None: is useless, because next_page_url can never be None; you need to check whether html is None instead.
The line next_page_url += html will give you an error when html is None, so check for that first.
Also, extract()[0] fails when there is no next-page link, so replace extract()[0] with extract_first() (I used get()).
Here is the fixed code:
import scrapy
from scrapy.crawler import CrawlerProcess
import time

class FinvizSpider(scrapy.Spider):
    name = "finviz"
    urlcheck = 1
    tickers = []
    start_urls = ["https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=change"]

    def parse(self, response):
        self.tickers.append(response.xpath('//a[@class="screener-link-primary"]/text()').extract())
        print(self.tickers)
        next_page_url = "https://finviz.com/"
        html = response.xpath('//a[@class="screener_arrow"]/@href').get()
        print(html)
        if html is not None:
            next_page_url += html
            print(next_page_url)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def returnTickers(self):
        newTickerList = []
        for lists in self.tickers:
            if lists:
                for t in lists:
                    newTickerList.append(t)
        return newTickerList
It looks like Scrapy can only call back 5 times here, so instead of using callbacks I would recommend iterating over a list of all the links; you can do it with BeautifulSoup and it is very simple.
Install
pip install BeautifulSoup4
BS4 import:
from bs4 import BeautifulSoup
Rest of code:
def parse(self, url):
    raw_html = scrapy.Request(url)
    good_html = BeautifulSoup(raw_html, 'html.parser')
    first_part = "https://finviz.com/"
    tickers.append([x.text for x in good_html.findAll('a', {'class': 'screener-link-primary'})])
    second_part = good_html.find('a', {'class': 'screener_arrow'})['href']
    # Check if there is next page
    if second_part:
        next_url = first_part + second_part
        self.parse(next_url)
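Note that scrapy.Request(url) on its own only builds a request object; it does not download the page, so BeautifulSoup would be parsing a Request rather than HTML. A minimal standalone sketch of the same idea, assuming the requests library is acceptable here, could look like this:

import requests
from bs4 import BeautifulSoup

def collect_tickers(url, tickers):
    # Download and parse one screener page
    html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
    good_html = BeautifulSoup(html, 'html.parser')
    tickers.append([x.text for x in good_html.find_all('a', {'class': 'screener-link-primary'})])
    # Follow the "next" arrow until it disappears
    arrow = good_html.find('a', {'class': 'screener_arrow'})
    if arrow and arrow.get('href'):
        collect_tickers("https://finviz.com/" + arrow['href'], tickers)

tickers = []
collect_tickers("https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=change", tickers)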

Crawled (403) error while logging in to glassdoor.com using Scrapy in Python. Need a solution?

Here is the complete code. I get a "Crawled (403)" error when I run it. If I bypass the HTTP error by adding HTTPERROR_ALLOWED_CODES = [403] to settings.py, the code starts working.
But I still need a solution for logging into the website.
import scrapy
from urllib.parse import urljoin
from scrapy.http import Request, FormRequest

class MoorSpider(scrapy.Spider):
    name = 'moor'
    allowed_domains = ['glassdoor.com']
    start_urls = ['https://www.glassdoor.com/profile/login_input.htm']
    page_number = 2

    def parse(self, response):
        token = response.xpath('.//*[@name="gdToken"]/@value').extract()
        # print(token)
        yield FormRequest('https://www.glassdoor.com/profile/ajax/loginSecureAjax.htm', formdata={'username': 'likej41679@94jo.com', 'password': '1a2b3c4d', 'gdToken': token}, callback=self.startscraper)

    def startscraper(self, response):
        yield Request('https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=3.5&page=1&isHiringSurge=0&locId=1282&locType=S&locName=North%20Carolina,%20US,%20US', callback=self.startscraper1)

    def startscraper1(self, response):
        urls = response.css('.col-12.my-0.mt-sm.mt-sm-std.order-5 a::attr(href)').extract()
        # print(next_page)
        for url in urls:
            url1 = urljoin('https://www.glassdoor.com/', url)
            yield Request(url1, callback=self.DetailPage)
        # next_page = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=3.5&page='+str(MoorSpider.page_number)+'&isHiringSurge=0&locId=1282&locType=S&locName=North%20Carolina,%20US,%20US'
        next_page = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=3.5&page=' + str(
            MoorSpider.page_number) + '&isHiringSurge=0&locId=1282&locType=S&locName=North%20Carolina,%20US,%20US'
        if MoorSpider.page_number <= 2:
            MoorSpider.page_number += 1
            yield response.follow(next_page, callback=self.startscraper1)

    def DetailPage(self, response):
        Company_Website = response.css('[data-test="employer-website"]::text').get()
        Company_Revenue = response.css('[data-test="employer-revenue"]::text').get()
        Company_Description = response.css('span[data-test="employerDescription"]::text').get()
        Company_Mission = response.css('span[data-test="employerMission"]::text').get()
        yield {
            'Company_Website': Company_Website,
            'Company_Revenue': Company_Revenue,
            'Company_Description': Company_Description,
            'Company_Mission': Company_Mission,
        }
Replace your existing parse method with the following one in order for it to work. It turns out that your token variable holds nothing, because the form field is generated dynamically. You can, however, parse the value of gdToken out of one of the script tags.
def parse(self, response):
    token = response.css('body').re(r"gdToken\":\"(.*?)\",")[0]
    yield FormRequest('https://www.glassdoor.com/profile/ajax/loginSecureAjax.htm', formdata={'username': 'likej41679@94jo.com', 'password': '1a2b3c4d', 'gdToken': token}, callback=self.startscraper)
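To see what that selector pulls out, here is a tiny illustration with a made-up script fragment (the token value and surrounding JSON are hypothetical; only the shape matters):

import re

# Hypothetical inline <script> content from the login page
body = '<script>window.appData = {"gdToken":"abc123-example-token","locale":"en"};</script>'

# Same pattern the answer feeds to response.css('body').re(...)
token = re.findall(r'gdToken\":\"(.*?)\",', body)[0]
print(token)  # abc123-example-token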

Crawl iframe and page at the same time

I just wanted to know if it's possible to crawl a page on a website and extract data from this page and from an iframe in this page at the same time?
I'm using scrapy with python and I already know how to extract data from the iframe...
Thank you for your help!!
Thanks to your answer, I made this... But I don't know what to put instead of 'url'... Can you help me again please?
# -*- coding: utf-8 -*-
import scrapy
import re
import numbers
from fnac.items import FnacItem
from urllib.request import urlopen
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup

class Fnac(CrawlSpider):  # scrapy.Spider
    name = 'FnacCom'
    allowed_domains = ['fnac.com']
    start_urls = ['http://www.fnac.com/MORMANE/srefA5533119-3387-5EC4-82B6-AA61216BF599']

    ##### To extract links in order to run the spider in them
    # rules = (
    #     Rule(LinkExtractor(allow=()), callback='parse'),
    # )

    def parse(self, response):
        soup = BeautifulSoup(urlopen(response.url), "lxml")
        iframexx = soup.find_all('iframe')
        for iframe in iframexx:
            yield scrapy.Request(iframe.attrs['src'], callback=self.parse2)

    ##### Main function
    def parse1(self, response):
        item1 = FnacItem()
        nb_sales = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()
        country = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()
        yield scrapy.Request(url, meta={'item': item1})  # I don't know what to put instead of URL...

    def parse2(self, response):
        same_item = response.meta['item']
        address = response.xpath('//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')
        email = response.xpath('//div/ul/li[contains(text(),"@")]/text()').extract()
        name = response.xpath('//div/p[@class="customer-policy-label"]/text()').re(r'Infos sur la boutique \: ([a-zA-Z0-9]*)')
        phone = response.xpath('//div/p/text()').re(r'.*Tél \: ([\d]*)\n?.*')
        siret = response.xpath('//div/p/text()').re(r'.*Siret \: ([\d]*)\n?.*')
        vat = response.xpath('//div/text()').re(r'.*TVA \: (.*)')
        if (len(name) != 0):
            item['name'] = ''.join(name).strip()
            item['address'] = ''.join(address).strip()
            item['phone'] = ''.join(phone).strip()
            item['email'] = ''.join(email).strip()
            item['nb_sales'] = ''.join(nb_sales).strip()
            item['country'] = ''.join(country).strip()
            item['vat'] = ''.join(vat).strip()
            item['siret'] = ''.join(siret).strip()
        return item
To combine information from different requests into a single item, you have to use the meta parameter of the request:
def parse1(self, response):
    item1 = {
        ...
    }
    yield Request(url='another_url.com', meta={'item': item1}, callback=self.parse2)

def parse2(self, response):
    same_item = response.meta['item']
    # keep populating the item with the second response
    ...
    yield same_item
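Applied to the iframe question above, a rough sketch (selectors copied from the question, assuming FnacItem declares these fields; untested against the live site) could use the iframe's src attribute as the missing url and carry the partially filled item along in meta:

def parse(self, response):
    item1 = FnacItem()
    item1['nb_sales'] = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()
    item1['country'] = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()
    # The iframe's src is the URL for the follow-up request
    iframe_src = response.xpath('//iframe/@src').extract_first()
    if iframe_src:
        yield scrapy.Request(response.urljoin(iframe_src), meta={'item': item1}, callback=self.parse2)

def parse2(self, response):
    item = response.meta['item']
    item['email'] = response.xpath('//div/ul/li[contains(text(),"@")]/text()').extract()
    yield item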

Webcrawler multiple page iteration

I want to make the crawler go to the next page to extract data; any help on what to do would be appreciated. I am a little lost. I tried Scrapy, but it is kind of complicated and bs4 is more convenient.
import bs4 as bs
import urllib.request
import pandas as pd
import re

source = urllib.request.urlopen('https://messageboards.webmd.com/').read()
soup = bs.BeautifulSoup(source, 'lxml')
df = pd.DataFrame(columns=['link'], data=[url.a.get('href') for url in soup.find_all('div', class_="link")])
lists = []
for i in range(0, 33):
    link = (df.link.iloc[i])
    source1 = urllib.request.urlopen(link).read()
    soup1 = bs.BeautifulSoup(source1, 'lxml')
    for url1 in soup1.find_all('a', class_="next"):
        next_link = soup1.find('a', href=True, text=re.compile("next"))
        if next_link:
            lists.append(link + url1.get('href'))
So it looks like you're storing hrefs in a list
for url1 in soup1.find_all('a', class_="next"):
    next_link = soup1.find('a', href=True, text=re.compile("next"))
    if next_link:
        lists.append(link + url1.get('href'))
Now you actually have to do something with them. In this case I'm assuming you want to navigate to each href in your list.
for href in lists:
    new_page = urllib.request.urlopen(href).read()
And then you can scrape whatever data you want out of new_page
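For example, a minimal sketch that parses each new_page with BeautifulSoup and pulls out link texts (the 'post-title' class is a placeholder for illustration; inspect the real page for the class you actually need):

import bs4 as bs
import urllib.request

for href in lists:
    new_page = urllib.request.urlopen(href).read()
    new_soup = bs.BeautifulSoup(new_page, 'lxml')
    # 'post-title' is a placeholder class name; replace it with the real one
    for title in new_soup.find_all('a', class_='post-title'):
        print(title.get_text(strip=True))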
I had the same problem. Here is a code example from a site I crawled for practice; I chained multiple requests to get detailed information.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from capterra.items import CapterraItem

class CapterraCatSpider(CrawlSpider):
    name = 'capterra_cat'
    # allowed_domains = ['http://www.capterra.com/categories']
    start_urls = ['http://www.capterra.com/categories']
    # rules = (
    #     Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    # )

    def parse(self, response):
        # TEMP
        for category in response.css('ol.browse-group-list'):
            # Debug: only elements of one category
            if category.css('a::text').extract_first() == 'Yoga Studio':
                i = CapterraItem()
                # Get link to detail page
                i['cat_name'] = category.css('a::text').extract_first()
                # join link to detail page with base url
                i['cat_link'] = response.urljoin(category.css('a::attr(href)').extract_first())
                cat_link = i['cat_link']
                print(cat_link)
                # call request to detail page and pass response to parse_details method with callback method
                request = scrapy.Request(cat_link, callback=self.parse_details)
                request.meta['item'] = i
                yield request

    def parse_details(self, response):
        # Debug print
        print('DETAILS!')
        # read your items from response meta
        item = response.meta['item']
        # iterate over listings
        for detail in response.css('p.listing-description.milli'):
            item['profile_link'] = response.urljoin(detail.css('a.spotlight-link::attr(href)').extract_first())
            # call request to profile page to get more information for listing
            request = scrapy.Request(item['profile_link'], callback=self.parse_profile)
            # set your item to request metadata
            request.meta['item'] = item
            yield request

    def parse_profile(self, response):
        # Debug print
        print('PROFILE')
        item = response.meta['item']
        item['product_name'] = response.css('h1.beta.no-margin-bottom::text').extract_first()
        item['who_uses_software'] = response.css('div.spotlight-target > p.epsilon > i::text').extract_first()
        item['vendor_name'] = response.css('h2.spotlight-vendor-name > span::text').extract_first()
        return item
