I have tried to run my program, but each time I get a network error in the middle of the run.
Basically, my program does this:
1. get the XML sitemap from my website
2. loop over all the URLs in it
3. get the data from each of my web pages (SKU, name, title, price, etc.)
4. get the lowest price from another website, by comparing prices for the same SKU
The problem is that I have more than 7,000 URLs in my XML, so my program hits a network error every time.
What can I do? How can I resolve it?
import requests
from bs4 import BeautifulSoup

def parse_sitemap(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'xml')
    out = []
    for u in soup.find_all('url'):
        loc = u.find('loc')
        # 'loc' is not a sitemap requirement - skip if not present
        if loc is None:
            continue
        out.append(loc.string)
    return out
def get_sku(u):
    html = requests.get(u)
    bsObj = BeautifulSoup(html.content, 'xml')
    sku = bsObj.find('span', attrs={'itemprop': 'sku'}).get_text()
    return sku
def get_price(u):
    try:
        html = requests.get(u)
        bsObj = BeautifulSoup(html.content, 'xml')
        price = bsObj.find('span', attrs={'itemprop': 'price'}).get_text()
        price = str(price).replace(' ₪', '')
        return price
    except Exception:
        return 'no price'
def get_zapPrice(makat):
    try:
        check = 'https://www.zap.co.il/search.aspx?keyword=' + makat
        r = requests.get(check)          # follow the search redirect
        html = requests.get(r.url)
        bsObj = BeautifulSoup(html.content, 'html.parser')
        zapPrice = bsObj.select_one('div.StoresLines div.PriceNum').text.strip().replace(' ₪', '')
        return zapPrice
    except Exception:
        return 'no zap product'
def get_zapStoreName(makat):
    try:
        check = 'https://www.zap.co.il/search.aspx?keyword=' + makat
        r = requests.get(check)
        html = requests.get(r.url)
        bsObj = BeautifulSoup(html.content, 'html.parser')
        storeName = bsObj.select_one('div.StoresLines div.BuyButtonsTxt').text.strip().replace('ב-', '')
        return storeName
    except Exception:
        return 'no zap product'
# urls comes from parse_sitemap(...); wb / ws1 are the openpyxl workbook
# and worksheet - their setup is in the part of the code not shown here.
for i, u in enumerate(urls, start=1):
    ws1['A1'] = u
    makat = get_sku(u)
    ws1['F1'] = makat
    zapPrice = get_zapPrice(makat)
    ws1['I1'] = zapPrice
    storeName = get_zapStoreName(makat)
    ws1['J1'] = storeName
    ws1.insert_rows(1)
    ws1.append([])
    print("writing product no. " + str(i))

ws1['F1'] = 'makat'
ws1['I1'] = 'zap price'
ws1['J1'] = 'zap store'
wb.save("sample.xlsx")
wb.close()
print('end')
I didn't write out all of my code, but the basic structure is here.
Each function starts with requests.get, extracts what I want, and returns it.
After that, I write the values to an Excel file.
The problem shows up after about 1,000 URL checks...
What is the problem?
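One common way to survive intermittent network errors over thousands of requests is to reuse a single requests.Session with automatic retries and a timeout, instead of calling requests.get from scratch every time. A minimal sketch (the retry counts, timeout and status codes are assumptions to tune):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=5, backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)

def safe_get(url):
    # One place to handle timeouts and connection errors instead of
    # letting a single failed request kill the whole 7,000-URL run.
    try:
        resp = session.get(url, timeout=30)
        resp.raise_for_status()
        return resp
    except requests.RequestException:
        return None

With something like this in place, each of the get_* functions would call safe_get(...) instead of requests.get(...); adding a short time.sleep between products and saving the workbook every few hundred rows (rather than only at the end) also keeps one bad request from costing the whole run.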
As of right now I have working code: a web scraper that logs into the Indeed job search site. My issue now is that I need to create a CSV file that lists every single job position that was found; currently it only gives me the number of positions available and the description of one of them. I hope I can get some help, I would greatly appreciate it.
import re
import csv
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint

jk_pattern = re.compile(r"jk:'([a-zA-Z0-9]+)'")
params = {"q": "mechanical+engineer", "l": "united+states", "start": 0}
url = "https://www.indeed.com/jobs"
job_keys = set()

for x in range(10):
    response = requests.get(url, params=params)
    if not response.status_code == 200:
        break
    else:
        keys = jk_pattern.findall(response.text)
        if len(keys) > 0:
            for key in keys:
                job_keys.add(key)
    params['start'] += 20
    sleep(randint(0, 3))

print(len(job_keys))  # number of unique job keys collected

template = "https://www.indeed.com/viewjob?jk={}"
jk = job_keys.pop()
job_url = template.format(jk)
response = requests.get(job_url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.find("div", id="jobDescriptionText").text)
from datetime import datetime

def get_record(card):
    """Extract job data from a single record"""
    job_title = card.h2.a.get('title')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = card.find('span', 'date').text
    today = datetime.today().strftime('%Y-%m-%d')
    summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
    job_url = 'https://www.indeed.com' + card.h2.a.get('href')
    # this does not exist for all jobs, so handle the exceptions
    salary_tag = card.find('span', 'salaryText')
    if salary_tag:
        salary = salary_tag.text.strip()
    else:
        salary = ''
    record = (job_title, company, job_location, post_date, today, summary, salary, job_url)
    return record

def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)  # get_url is not shown in the post (see the sketch below)
    # extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
    # save the job data
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)
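The main routine above calls a get_url helper that isn't included in the post; a minimal sketch of what it presumably looks like (the q/l query-string format mirrors the params used in the first snippet, but is an assumption):

def get_url(position, location):
    """Build the Indeed search URL from the position and location keywords."""
    template = 'https://www.indeed.com/jobs?q={}&l={}'
    # e.g. get_url('mechanical engineer', 'united states')
    return template.format(position.replace(' ', '+'), location.replace(' ', '+'))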
I am building a scraper for eBay. I am trying to figure out a way to manipulate the page-number portion of the eBay URL to go to the next page until there are no more pages (if you were on page 2, the page-number portion would look like "_pgn=2"). I noticed that if you put in any number greater than the listing's maximum number of pages, the page reloads to the last page rather than giving a "page doesn't exist" error (if a listing has 5 pages, then _pgn=100 routes to the same page as _pgn=5).

How can I implement a way to start at page one, get the HTML soup of the page, get all the relevant data I want from the soup, then load the next page with the new page number, and repeat until there are no new pages to scrape?

I tried to get the number of results a listing has by using a Selenium XPath and taking math.ceil of the quotient of the number of results and 50 (the default maximum number of listings per page), and to use that as my max_page, but I get errors saying the element doesn't exist even though it does (self.driver.findxpath('xpath').text). That element shows the result count (243 in my case), which is what I am trying to get with the XPath.
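Incidentally, computing max_page from the result count doesn't require Selenium; here is a minimal sketch with requests, BeautifulSoup and math.ceil (the 'srp-controls__count-heading' selector is an assumption about where the "243 results" text lives, so adjust it to the real element):

import math
import re

import requests
from bs4 import BeautifulSoup

def get_max_page(search_url, per_page=50):
    # Derive the page count from the total result count shown on page 1.
    soup = BeautifulSoup(requests.get(search_url).text, 'html.parser')
    # NOTE: this selector is an assumption; change it to whatever element
    # actually holds the "243 results" text.
    heading = soup.select_one('h1.srp-controls__count-heading')
    if heading is None:
        return 1
    match = re.search(r'([\d,]+)', heading.get_text())
    if match is None:
        return 1
    total_results = int(match.group(1).replace(',', ''))
    return math.ceil(total_results / per_page)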
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

class EbayScraper(object):

    def __init__(self, item, buying_type):
        self.base_url = "https://www.ebay.com/sch/i.html?_nkw="
        self.driver = webdriver.Chrome(r"chromedriver.exe")
        self.item = item
        self.buying_type = buying_type + "=1"
        self.url_seperator = "&_sop=12&rt=nc&LH_"
        self.url_seperator2 = "&_pgn="
        self.page_num = "1"

    def getPageUrl(self):
        if self.buying_type == "Buy It Now=1":
            self.buying_type = "BIN=1"
        self.item = self.item.replace(" ", "+")
        url = self.base_url + self.item + self.url_seperator + self.buying_type + self.url_seperator2 + self.page_num
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup

    def getInfo(self, soup):
        for listing in soup.find_all("li", {"class": "s-item"}):
            raw = listing.find_all("a", {"class": "s-item__link"})
            if raw:
                raw_price = listing.find_all("span", {"class": "s-item__price"})[0]
                raw_title = listing.find_all("h3", {"class": "s-item__title"})[0]
                raw_link = listing.find_all("a", {"class": "s-item__link"})[0]
                raw_condition = listing.find_all("span", {"class": "SECONDARY_INFO"})[0]
                condition = raw_condition.text
                price = float(raw_price.text[1:])
                title = raw_title.text
                link = raw_link['href']
                print(title)
                print(condition)
                print(price)
                if self.buying_type != "BIN=1":
                    raw_time_left = listing.find_all("span", {"class": "s-item__time-left"})[0]
                    time_left = raw_time_left.text[:-4]
                    print(time_left)
                print(link)
                print('\n')

if __name__ == '__main__':
    item = input("Item: ")
    buying_type = input("Buying Type (e.g, 'Buy It Now' or 'Auction'): ")
    instance = EbayScraper(item, buying_type)
    page = instance.getPageUrl()
    instance.getInfo(page)
If you want to iterate over all pages and gather all results, then your script needs to check whether there is a next page after you visit each page:
import requests
from bs4 import BeautifulSoup

class EbayScraper(object):

    def __init__(self, item, buying_type):
        ...
        self.currentPage = 1

    def get_url(self, page=1):
        if self.buying_type == "Buy It Now=1":
            self.buying_type = "BIN=1"
        self.item = self.item.replace(" ", "+")
        # _ipg=200 means we expect 200 items per page
        return '{}{}{}{}{}{}&_ipg=200'.format(
            self.base_url, self.item, self.url_seperator, self.buying_type,
            self.url_seperator2, page
        )

    def page_has_next(self, soup):
        container = soup.find('ol', 'x-pagination__ol')
        currentPage = container.find('li', 'x-pagination__li--selected')
        next_sibling = currentPage.next_sibling
        if next_sibling is None:
            # debug: show the pagination markup when no next page is found
            print(container)
        return next_sibling is not None

    def iterate_page(self):
        # loop while there are more pages, otherwise stop
        while True:
            page = self.getPageUrl(self.currentPage)
            self.getInfo(page)
            if self.page_has_next(page) is False:
                break
            else:
                self.currentPage += 1

    def getPageUrl(self, pageNum):
        url = self.get_url(pageNum)
        print('page: ', url)
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup

    def getInfo(self, soup):
        ...

if __name__ == '__main__':
    item = input("Item: ")
    buying_type = input("Buying Type (e.g, 'Buy It Now' or 'Auction'): ")
    instance = EbayScraper(item, buying_type)
    instance.iterate_page()
The important functions here are page_has_next and iterate_page.
page_has_next - a function that checks whether the page's pagination has another li element after the currently selected page, e.g. for < 1 2 3 >: if we are on page 1, it checks whether 2 comes next.
iterate_page - a function that loops until there is no next page.
Also note that you don't need Selenium for this unless you need to mimic user clicks or need a browser to navigate.
I am trying to scrape this website (which has multiple pages) using Scrapy. The problem is that I can't find the next-page URL.
Do you have an idea of how to scrape a website with multiple pages (with Scrapy), or how to solve the error I'm getting with my code?
I tried the code below, but it's not working:
import string
import datetime

import scrapy

class AbcdspiderSpider(scrapy.Spider):
    """
    Class docstring
    """
    name = 'abcdspider'
    allowed_domains = ['abcd-terroir.smartrezo.com']
    alphabet = list(string.ascii_lowercase)
    url = "https://abcd-terroir.smartrezo.com/n31-france/annuaireABCD.html?page=1&spe=1&anIDS=31&search="
    start_urls = [url + letter for letter in alphabet]
    main_url = "https://abcd-terroir.smartrezo.com/n31-france/"
    crawl_datetime = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    start_time = datetime.datetime.now()

    def parse(self, response):
        self.crawler.stats.set_value("start_time", self.start_time)
        try:
            page = response.xpath('//div[@class="pageStuff"]/span/text()').get()
            page_max = get_num_page(page)  # get_num_page is a helper defined elsewhere in the asker's project
            for index in range(page_max):
                producer_list = response.xpath('//div[@class="clearfix encart_ann"]/@onclick').getall()
                for producer in producer_list:
                    link_producer = self.main_url + producer
                    yield scrapy.Request(url=link_producer, callback=self.parse_details)
                next_page_url = "/annuaireABCD.html?page={}&spe=1&anIDS=31&search=".format(index)
                if next_page_url is not None:
                    yield scrapy.Request(response.urljoin(self.main_url + next_page_url))
        except Exception as e:
            self.crawler.stats.set_value("error", e.args)
I am getting this error:
'error': ('range() integer end argument expected, got unicode.',)
The error is here:
page = response.xpath('//div[@class="pageStuff"]/span/text()').get()
page_max = get_num_page(page)
The range function expects an integer value (1, 2, 3, 4, etc.), not a unicode string ('Page 1 / 403').
My proposal for the range error is
page = response.xpath('//div[@class="pageStuff"]/span/text()').get().split('/ ')[1]
for index in range(int(page)):
    # your actions
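A slightly more defensive variant (just a sketch, meant to live inside parse()) pulls the trailing number out with a regular expression, so extra spaces around the slash don't break the split:

import re

# the counter text looks like 'Page 1 / 403'
page_text = response.xpath('//div[@class="pageStuff"]/span/text()').get()
match = re.search(r'/\s*(\d+)', page_text or '')
page_max = int(match.group(1)) if match else 1
for index in range(1, page_max + 1):
    # your actions per page
    pass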
I am trying to obtain a stock's current price and then put it into a variable to run if/else statements on. I have used the Google API to retrieve current stock prices, but I am unable to figure out how to put the price into a variable. Thanks!
import json
import sys

try:
    from urllib.request import Request, urlopen
except ImportError:  # python 2
    from urllib2 import Request, urlopen

googleFinanceKeyToFullName = {
    u'id'     : u'ID',
    u't'      : u'StockSymbol',
    u'e'      : u'Index',
    u'l'      : u'LastTradePrice',
    u'l_cur'  : u'LastTradeWithCurrency',
    u'ltt'    : u'LastTradeTime',
    u'lt_dts' : u'LastTradeDateTime',
    u'lt'     : u'LastTradeDateTimeLong',
    u'div'    : u'Dividend',
    u'yld'    : u'Yield'
}

def buildUrl(symbols):
    symbol_list = ','.join([symbol for symbol in symbols])
    # a deprecated but still active & correct api
    return 'http://finance.google.com/finance/info?client=ig&q=' + symbol_list

def request(symbols):
    url = buildUrl(symbols)
    req = Request(url)
    resp = urlopen(req)
    # remove special symbols such as the pound symbol
    content = resp.read().decode('ascii', 'ignore').strip()
    content = content[3:]
    return content

def replaceKeys(quotes):
    global googleFinanceKeyToFullName
    quotesWithReadableKey = []
    for q in quotes:
        qReadableKey = {}
        for k in googleFinanceKeyToFullName:
            if k in q:
                qReadableKey[googleFinanceKeyToFullName[k]] = q[k]
        quotesWithReadableKey.append(qReadableKey)
    return quotesWithReadableKey

def getQuotes(symbols):
    if type(symbols) == type('str'):
        symbols = [symbols]
    content = json.loads(request(symbols))
    return replaceKeys(content)

if __name__ == '__main__':
    try:
        symbols = sys.argv[1]
    except:
        symbols = "GOOG,AAPL,MSFT,AMZN,SBUX"
    symbols = symbols.split(',')
    try:
        print(json.dumps(getQuotes(symbols), indent=2))
    except:
        print("Fail")
You can get the last current stock price from the dictionary and put it into a variable, say price, by changing the last part of the code to
try:
    quotes = getQuotes(symbols)
    price = quotes[-1]['LastTradePrice']  # -1 means last in a list
    print(price)
except Exception as e:
    print(e)
but it is very unreliable, because if the order of the prices changes, you will get a price for a different stock.
What you should do is learn how to define a data structure that's suitable to solve your problem.
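For example, one way to make the lookup independent of ordering is to key the quotes by their symbol (a minimal sketch, assuming getQuotes returns dicts containing the 'StockSymbol' and 'LastTradePrice' keys from the mapping above):

def quotes_by_symbol(symbols):
    # Map each ticker symbol to its full quote dict, so a lookup never
    # depends on the order the API happens to return the quotes in.
    return {q['StockSymbol']: q for q in getQuotes(symbols)}

quotes = quotes_by_symbol(['GOOG', 'AAPL', 'MSFT'])
# LastTradePrice comes back as a string and may contain thousands separators
price = float(quotes['GOOG']['LastTradePrice'].replace(',', ''))
if price > 1000:
    print('GOOG is above 1000')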
So I was just wondering what my getURLs function's issue might be. I'm trying to get all URLs from within the containing body's string.
My crawler isn't crawling anything because the input URLs are invalid.
# Get all URLs contained within the body string
def getURLs(body):
    urls = []
    tempArr = body.split("a href=")
    for part in tempArr:
        if part and part[0] == '"':
            index = 1
            # advance to the closing quote (bounds check first to avoid IndexError)
            while index < len(part) and part[index] != '"':
                index += 1
            if index < len(part):
                urls.append(part[1:index])
    return urls
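If the hrefs pulled out of the body are relative, they also have to be resolved against the page they came from before they are usable as request URLs; a minimal sketch (base_url is whatever page the body text was taken from):

try:
    from urllib.parse import urljoin   # Python 3
except ImportError:
    from urlparse import urljoin       # Python 2

def absolute_urls(base_url, body):
    # Resolve every extracted href against the page it was found on,
    # so relative links become fully qualified URLs scrapy can request.
    return [urljoin(base_url, u) for u in getURLs(body)]

todays_links could then be passed through absolute_urls(response.url, todays_section) before yielding requests for them.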
import time
from datetime import datetime, timedelta

import scrapy
from scrapy.spiders import CrawlSpider

# items, backpage_date_today and backpage_date_yesterday are defined
# elsewhere in the project and are not shown in the post.

# Open file which contains input urls
with open("test_urls.txt", "rU") as infile:
    urls = [row.strip("\n") for row in infile]

class BackpageSpider(CrawlSpider):
    name = 'backpage'
    allowed_domains = ['backpage.com']
    start_urls = urls

    def parse(self, response):
        # print response.url
        if response.status < 600:
            # all_links = response.xpath("//div[contains(@class,'cat')]/a/@href").extract()
            # all the links FOR THE ESCORTS on whatever page we're on
            todays_links = []
            # all the links for today's date
            backpage_date = backpage_date_today()
            yesterday_date = backpage_date_yesterday()
            if backpage_date in response.body:
                todays_section = response.body.split(backpage_date)[1].split(yesterday_date)[0].decode('utf-8')
                # todays_links = todays_section.xpath("//div[contains(@class,'cat')]/a/@href").extract
                todays_links = getURLs(todays_section)
            # for url in todays_links:
            #     todays_links.append(url)
            # for url in all_links:
            #     if url in todays_section:
            #         todays_links.append(url)
            for url in todays_links:
                yield scrapy.Request(url, callback=self.parse_ad_into_content)  #### HERE
            for url in set(response.xpath('//a[@class="pagination next"]/@href').extract()):
                yield scrapy.Request(url, callback=self.parse)
        else:
            time.sleep(600)
            yield scrapy.Request(response.url, callback=self.parse)

    def parse_ad_into_content(self, response):
        # ipdb.set_trace()
        item = items.BackpageScrapeItem(
            url=response.url,
            backpage_id=response.url.split('.')[0].split('/')[2].encode('utf-8'),
            text=response.body,
            posting_body=response.xpath("//div[@class='postingBody']").extract()[0].encode('utf-8'),
            date=datetime.utcnow() - timedelta(hours=5),
            posted_date=response.xpath("//div[@class='adInfo']/text()").extract()[0].encode('utf-8'),
            posted_age=response.xpath("//p[@class='metaInfoDisplay']/text()").extract()[0].encode('utf-8'),
            posted_title=response.xpath("//div[@id='postingTitle']//h1/text()").extract()[0].encode('utf-8')
        )
        return item
The web page is: http://grandisland.backpage.com/FemaleEscorts/?layout=date