HTTP requests freeze after several requests - Python

Okay, here is my code:
from lxml import html
from lxml import etree
from selenium import webdriver
import calendar
import math
import urllib.parse
import progressbar
import requests
Using Selenium:
path_to_driver = '/home/vladislav/Shit/geckodriver'
browser = webdriver.Firefox(executable_path = path_to_driver)
I create a dict where I store the data, and set up progress bars:
DataDict = {}
barY = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
barM = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
barW = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
I form the parameters in a loop, construct a URL from them, and send a browser.get request:
for year in range(2014, 2016):
    barY.update(year)
    for month in range(1, 13):
        barM.update(month)
        weeks = math.ceil(calendar.monthrange(year, month)[1] / 4)
        for week in range(weeks):
            barW.update(week)
            if week > 2:
                start_day = 22
                end_day = calendar.monthrange(year, month)[1]
            else:
                start_day = 7 * week + 1
                end_day = 7 * (week + 1)
            start_date = str(year) + '-' + str(month).zfill(2) + '-' + str(start_day).zfill(2)
            end_date = str(year) + '-' + str(month).zfill(2) + '-' + str(end_day).zfill(2)
            params = {'end-date': end_date, 'start-date': start_date}
            url = 'http://www.finam.ru/profile/moex-akcii/aeroflot/news'
            url = url + ('&' if urllib.parse.urlparse(url).query else '?') + urllib.parse.urlencode(params)
The request itself:
            browser.get(url)
            try:
                news_list = browser.find_element_by_class_name('news-list')
                news_list_text = news_list.text
                news_list_text = news_list_text.split('\n')
                for i in range(int(len(news_list_text) / 2)):
                    DataDict.update({news_list_text[2 * i]: news_list_text[2 * i + 1]})
                print("Found! Adding news to the dictionary!")
            except:
                pass
But after 2-4 requests it just freezes :(
What's the problem?

Okay, the problem was an advertising banner that appeared after several requests. The solution is simply to wait (time.sleep) until the banner disappears, and then send the request again:
import time

try:
    browser.get(url)
    try:
        news_list = browser.find_element_by_class_name('news-list')
        news_list_text = news_list.text
        news_list_text = news_list_text.split('\n')
        for i in range(int(len(news_list_text) / 2)):
            DataDict.update({news_list_text[2 * i]: news_list_text[2 * i + 1]})
        # print("Found! Adding news to the dictionary!")
    except:
        pass
    time.sleep(10)
except:
    print("perhaps this annoying ad?")
    try:
        news_list = browser.find_element_by_class_name('news-list')
        news_list_text = news_list.text
        news_list_text = news_list_text.split('\n')
        for i in range(int(len(news_list_text) / 2)):
            DataDict.update({news_list_text[2 * i]: news_list_text[2 * i + 1]})
        # print("Found! Adding news to the dictionary!")
    except:
        pass
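If the fixed time.sleep ever turns out to be unreliable, a variant of the same idea (just a sketch, not tested against finam.ru) is to cap the page-load time and use an explicit wait, so the script only blocks until the news list actually shows up:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser.set_page_load_timeout(30)  # give up on page loads that hang
try:
    browser.get(url)
except Exception:
    pass  # the overlay sometimes stalls the load; carry on and wait below
try:
    # wait up to 20 s for the news list instead of sleeping a fixed time
    news_list = WebDriverWait(browser, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'news-list'))
    )
    news_list_text = news_list.text.split('\n')
    for i in range(int(len(news_list_text) / 2)):
        DataDict.update({news_list_text[2 * i]: news_list_text[2 * i + 1]})
except Exception:
    pass  # no news list found on this page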

Related

Web scraping code using BS4+request not refreshing

I have a problem with a script that scrapes a weather website. It's supposed to update hourly, but for some reason the data returned is not the current data on the website; it also doesn't update, but keeps feeding the same data continuously. Please help!
Also, I need help scraping the weather icon from the site.
Here is my code:
from bs4 import BeautifulSoup
from plyer import notification
import requests
import time

if __name__ == '__main__':
    while True:
        def notifyMe(title, message):
            notification.notify(
                title = title,
                message = message,
                #app_icon = icon,
                timeout = 7
            )
        try:
            # site = requests.get('https://weather.com/weather/today/l/5.02,7.97?par=google')
            site = requests.get('https://weather.com/en-NG/weather/today/l/4dce0117809bca3e9ecdaa65fb45961a9718d6829adeb72b6a670240e10bd8c9')
            # site = requests.get('http://localhost/weather.com/weather/today/l/5.02,7.97.html')
            soup = BeautifulSoup(site.content, 'html.parser')
            day = soup.find(class_= 'CurrentConditions--CurrentConditions--14ztG')
            location = day.find(class_='CurrentConditions--location--2_osB').get_text()
            timestamp = day.find(class_='CurrentConditions--timestamp--3_-CV').get_text()
            tempValue = day.find(class_='CurrentConditions--tempValue--1RYJJ').get_text()
            phraseValue = day.find(class_='CurrentConditions--phraseValue--17s79').get_text()
            precipValue = day.find(class_='CurrentConditions--precipValue--1RgXi').get_text()
            #icon = day.find(id ='svg-symbol-cloud').get_icon()
            weather = timestamp + "\n" + tempValue + " " + phraseValue + "\n" + precipValue
        except requests.exceptions.ConnectionError:
            location = "Couldn't get a location."
            weather = "Error connecting to website."
        except AttributeError:
            weather = timestamp + "\n" + tempValue + " " + phraseValue
        # print (weather)
        notifyMe( location, weather )
        time.sleep(30)
Expected output:
Uyo, Akwa Ibom Weather
As of 13:28 WAT
30° Mostly Cloudy
55% chance of rain until 14:00
import requests
from bs4 import BeautifulSoup

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    x = list(soup.select_one('.card').stripped_strings)
    del x[4:8]
    print(x)

main('https://weather.com/en-NG/weather/today/l/4dce0117809bca3e9ecdaa65fb45961a9718d6829adeb72b6a670240e10bd8c9')
Output:
['Uyo, Akwa Ibom Weather', 'As of 8:03 WAT', '24°', 'Cloudy', '47% chance of rain until 9:00']
It appears the error might have been from the site, because it's working now without issues. Thank you all for the suggestions. @Ahmed American, your code is beautiful; I've learnt from it. @furas, I'll try to construct the SVG as you suggested.
That's the output.
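As for the icon: a minimal sketch of what furas's SVG suggestion could look like (this assumes the icon is an inline <svg> element inside the current-conditions card from the question's code; not verified against the live page):

# 'day' is the CurrentConditions container found in the question's code
icon_svg = day.find('svg')
if icon_svg is not None:
    with open('weather_icon.svg', 'w', encoding='utf-8') as f:
        f.write(str(icon_svg))  # write the raw SVG markup to an icon file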

Python scheduling - how can I pull data out of HTML every 5 minutes?

I'm practicing by building a project. I want to get the stock value every 5 minutes; if the price matches the set price, I should get an e-mail and a desktop notification, but now I have some trouble... I don't know how to fix this...
import bs4
import requests
import schedule
import time
import smtplib
import email.message
from win10toast import ToastNotifier
from function import send_email

stock_no = input('Please insert stock no:')
set_price = input('Please set notification price:')

def job():
    links = 'https://histock.tw/stock/%s' % stock_no
    response = requests.get(links)
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    tittle = soup.find('h3').get_text().strip()
    li = soup.find('span', id="Price1_lbTPrice").span.get_text()
    msg_text = tittle + 'stock value is ' + li

schedule.every(5).minutes.do(job)

while True:
    schedule.run_pending()
    time.sleep(1)
    if set_price is li:
        send_email(msg_text)
        toaster = ToastNotifier()
        toaster.show_toast("Stock value notification",
                           msg_text,
                           duration=10)
There's something wrong... like this:
[screenshot of the error - "This is my problem" - not included]
You have declared "li" and "msg_text" inside the "job" function, which means those variables are only available inside "job".
There are many ways to solve this problem; I will just propose one to try to help you:
import bs4
import requests
import schedule
import time
import smtplib
import email.message
from win10toast import ToastNotifier
from function import send_email

stock_no = input('Please insert stock no:')
set_price = input('Please set notification price:')

def get_stock_price():
    links = 'https://histock.tw/stock/%s' % stock_no
    response = requests.get(links)
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    tittle = soup.find('h3').get_text().strip()
    li = soup.find('span', id="Price1_lbTPrice").span.get_text()
    return li

schedule.every(5).minutes.do(get_stock_price)

while True:
    schedule.run_pending()
    time.sleep(1)
    current_price = get_stock_price()
    if set_price == current_price:
        msg_text = tittle + 'stock value is ' + current_price
        send_email(msg_text)
        toaster = ToastNotifier()
        toaster.show_toast("Stock value notification",
                           msg_text,
                           duration=10)
I didn't test the code above, but it may help you understand the error you posted.
Good luck and happy coding!
Finally, I fixed this based on Tito's answer. I'm posting the code here in the hope that it helps somebody with the same question.
import bs4
import requests
import schedule
import time
import smtplib
import email.message
from win10toast import ToastNotifier
from function import send_email

stock_no = input('Please insert stock no:')
set_price = '%.2f' % int(input('Please set notification price:'))

def get_stock_price():
    links = 'https://histock.tw/stock/%s' % stock_no
    response = requests.get(links)
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    tittle = soup.find('h3').get_text().strip()
    li = soup.find('span', id="Price1_lbTPrice").span.get_text()
    return li, tittle

schedule.every(5).minutes.do(get_stock_price)

while True:
    try:
        schedule.run_pending()
        time.sleep(1)
        current_price = get_stock_price()[0]
        if set_price == current_price:
            msg_text = get_stock_price()[1] + \
                'stock value is ' + current_price
            send_email(msg_text)
            toaster = ToastNotifier()
            toaster.show_toast("Stock value notification",
                               msg_text,
                               duration=10)
    except:
        print('It is not working...')
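As a side note, a slightly leaner pattern (just a sketch, reusing get_stock_price, set_price and send_email from the code above, not tested against histock.tw) is to do the comparison and the notification inside the scheduled job, so that schedule alone controls the five-minute cadence:

def check_price():
    current_price, title = get_stock_price()
    if set_price == current_price:
        msg_text = title + ' stock value is ' + current_price
        send_email(msg_text)
        ToastNotifier().show_toast("Stock value notification", msg_text, duration=10)

schedule.every(5).minutes.do(check_price)

while True:
    schedule.run_pending()  # only fires check_price once 5 minutes have passed
    time.sleep(1)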

Finding when a webpage updates via Python?

So, I am scraping a webpage that has an element displaying an integer. When I scrape that element, I store the plain text in a variable; then on each scrape I compare the variable to the plain text currently on the webpage. I am not sure if maybe I need to send a request to the webpage each time?
from win10toast import ToastNotifier
from _overlapped import NULL
from plyer import notification
import requests
from bs4 import BeautifulSoup

toaster = ToastNotifier()
toaster.show_toast("Notification!", "Alert!", threaded=True, icon_path=NULL, duration=3)

URL = "https://rocketleague.tracker.network/rocket-league/profile/steam/76561198074072333/mmr?playlist=13"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

_title = ""
_message = ""
recent_mmr = "111"

def get_mmr(url):
    results = soup.find_all(class_="stat")
    for stat in results:
        titles = stat.find_all(class_="label")
        for t in titles:
            if t.text.strip() == "Rating":
                val = stat.find(class_="value").text.strip()
                return val

def get_rank(url):
    results = soup.find(class_="stat tier")
    rank = results.find(class_="label")
    return rank.text.strip()

_message = "Rank: " + get_rank(URL) + "\n" + "MMR: " + get_mmr(URL)
recent_mmr = get_mmr(URL)

import time

while toaster.notification_active():
    time.sleep(0.1)

notification.notify(
    title="Ranked 3v3",
    message= _message,
    app_icon=NULL,
    timeout=10
)

print(recent_mmr)
recent_mmr = get_mmr(URL)

while True:
    print('running')
    #page = requests.get(URL)
    recent_mmr = get_mmr(URL)
    mmr_temp = recent_mmr
    print(mmr_temp + "(temp mmr)")
    if mmr_temp == recent_mmr:
        print("No update, recent MMR: " + recent_mmr)
        mmr_temp = recent_mmr
        time.sleep(60)
    else:
        notification.notify(
            title="Ranked 3v3",
            message= _message,
            app_icon=NULL,
            timeout=10
        )
        time.sleep(60)
        recent_mmr = get_mmr(URL)
        mmr_temp = recent_mmr
        print("Updated, recent MMR: " + recent_mmr)
You're scraping the webpage to get the recent_mmr number, copying that to mmr_temp, and then immediately comparing to see if they're equal -- well of course they are, because you just copied it!
You need to reorganize the loop a little bit, and copy the mmr variable at the bottom of the loop:
previous_mmr = None

while True:
    recent_mmr = get_mmr()
    if recent_mmr != previous_mmr:
        print("mmr changed")
        previous_mmr = recent_mmr
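To answer the other part of the question: yes, the page has to be requested again on every check, because the soup object in the posted script is built once and never refreshed, so get_mmr keeps parsing the same HTML. A minimal sketch combining both fixes (it reuses the URL and class names from the question and is not tested against the live site):

import time
import requests
from bs4 import BeautifulSoup

URL = "https://rocketleague.tracker.network/rocket-league/profile/steam/76561198074072333/mmr?playlist=13"

def get_mmr(url):
    # re-download and re-parse the page on every call so changes become visible
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    for stat in soup.find_all(class_="stat"):
        label = stat.find(class_="label")
        if label and label.text.strip() == "Rating":
            return stat.find(class_="value").text.strip()
    return None

previous_mmr = None
while True:
    recent_mmr = get_mmr(URL)
    if recent_mmr != previous_mmr:
        print("MMR changed:", recent_mmr)
        previous_mmr = recent_mmr
    time.sleep(60)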

Selenium web scraping: trying to get a product list but always getting the same product

I am trying to get a product list from a website with Selenium. I prototyped the program and everything worked perfectly, but now I built a loop to get all products and it just gives me the same product 484 times (that's the number of products on the website).
Here is my code:
from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client
import selenium
from selenium import webdriver

# URL to web scrape from
page_url = "https://www.smythstoys.com/at/de-at/spielzeug/lego/c/SM100114"
driver = webdriver.Chrome()
driver.get(page_url)

buttonName = "loadMoreProducts"
loadMoreButton = driver.find_element_by_id(buttonName)
while loadMoreButton is not None:
    try:
        try:
            loadMoreButton.click()
        except selenium.common.exceptions.ElementNotInteractableException:
            break
    except selenium.common.exceptions.ElementClickInterceptedException:
        break

uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# gets all products
containers = driver.find_elements_by_tag_name('article')
print(len(containers))

# name the output file to write to local disk
out_filename = "smythstoys_product_data.csv"
# header of csv file to be written
headers = "product_name;price; info \n"

# opens file, and writes headers
f = open(out_filename, "w")
f.write(headers)

# loops through all products
# -----------------------------------------------------------------------
# here is the problem:
for container in driver.find_elements_by_tag_name('article'):
    print("----------------------------------------------------------------------")
    product_name_container = container.find_element_by_xpath("//h2[@class ='prodName trackProduct']")
    product_name = product_name_container.text
    print(product_name)
    price_container = container.find_element_by_xpath("//div[@class ='price']")
    price = price_container.text
    print("price:", price)
    # ------------------------------------------------------------------------------------
    try:
        info_container = container.find_element_by_xpath("//span[@class ='decalImage-right']").text
        print(info_container)
        if not info_container:
            info = "no special type"
            print(info)
            print(info_container)
            f.write(product_name + "; " + price + "; " + info + "\n")
            continue
        if info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/hed/h5f/8823589830686" \
                             "/lego-hard-to-find-decal_CE.svg":
            info = "seltenes Set"
        elif info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/h41/h70" \
                               "/8823587930142/new-decal_CE%20%281%29.svg":
            info = "neues Set"
        elif info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/hde/hae" \
                               "/8871381303326/sale-decal_CE.svg":
            info = "Sale"
        else:
            info = "unknown type" + info_container
        print(info)
        print(info_container)
    except NameError:
        print("no attribute")
        if info_container is None:
            info = "unknown type"
            print(info)
    # writes the dataset to file
    f.write(product_name + "; " + price + "; " + info + "\n")

f.close()  # Close the file
My output is:
LEGO Star Wars 75244 Tantive IV
price: 199,99€
no special type
and that 484 times.
I'm not sure why you used Selenium to get the products when requests can do it smoothly. The following is something you want to do to get all the products using requests:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

link = "https://www.smythstoys.com/at/de-at/at/de-at/spielzeug/lego/c/SM100114/load-more?"
params = {'q': ':bestsellerRating:productVisible:true', 'page': '1'}
p = 0

while True:
    params['page'] = p
    r = requests.get(link, params=params, headers={
        'content-type': 'application/json; charset=utf-8'
    })
    soup = BeautifulSoup(r.json()['htmlContent'], "lxml")
    if not soup.select_one("a.trackProduct[href]"): break
    for item in soup.select("a.trackProduct[href]"):
        product_name = item.select_one("h2.prodName").get_text(strip=True)
        product_price = item.select_one("[itemprop='price']").get("content")
        print(product_name, product_price)
    p += 1
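If you would rather keep the Selenium version, the likely reason every iteration prints the same product is that an XPath starting with // searches the whole document even when it is called on an element, so each container matches the first product on the page. Prefixing the expressions with a dot makes them relative to the container; roughly (a sketch, not tested against the site):

for container in driver.find_elements_by_tag_name('article'):
    # the leading dot restricts the XPath search to this <article> element
    product_name = container.find_element_by_xpath(".//h2[@class='prodName trackProduct']").text
    price = container.find_element_by_xpath(".//div[@class='price']").text
    print(product_name, price)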

Web crawler not able to process more than one webpage

I am trying to extract some information about MTG cards from a webpage with the following program, but I repeatedly retrieve information about the initial page (InitUrl). The crawler is unable to proceed further. I have started to believe that I am not using the correct URLs, or maybe there is a restriction on using urllib that slipped my attention. Here is the code I have struggled with for weeks now:
import re
from math import ceil
from urllib.request import urlopen as uReq, Request
from bs4 import BeautifulSoup as soup

InitUrl = "https://mtgsingles.gr/search?q=dragon"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 4  # depth of pages to be retrieved
query = InitUrl.split("?")[1]

for i in range(0, NumOfPages):
    if i == 0:
        Url = InitUrl
    else:
        Url = URL_Next
    print(Url)

    UClient = uReq(Url)  # downloading the url
    page_html = UClient.read()
    UClient.close()

    page_soup = soup(page_html, "html.parser")
    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})

    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")

        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"

        cardType = card.contents[3].text
        print(card_name + "\n" + cardP_T + "\n" + cardType + "\n")

    try:
        URL_Next = InitUrl + "&page=" + str(i + 2)
        print("The next URL is: " + URL_Next + "\n")
    except IndexError:
        print("Crawling process completed! No more information to retrieve!")
    else:
        NumOfCrawledPages += 1
        Url = URL_Next
    finally:
        print("Moving to page : " + str(NumOfCrawledPages + 1) + "\n")
One of the reasons your code fails is that you don't use cookies. The site seems to require them to allow paging.
A clean and simple way of extracting the data you're interested in would be like this:
import requests
from bs4 import BeautifulSoup

# the site actually uses this url under the hood for paging - check out Google Dev Tools
paging_url = "https://mtgsingles.gr/search?ajax=products-listing&lang=en&page={}&q=dragon"

return_list = []

# the page-scroll will only work when we support cookies
# so we fetch the page in a session
session = requests.Session()
session.get("https://mtgsingles.gr/")
All pages have a next button except the last one. So we use this knowledge to loop until the next button goes away. When it does - meaning the last page has been reached - the button is replaced with an 'li' tag with the class 'next hidden', which only exists on the last page.
Now we're ready to start looping:
page = 1  # set count for start page
keep_paging = True  # use flag to end loop when last page is reached

while keep_paging:
    print("[*] Extracting data for page {}".format(page))
    r = session.get(paging_url.format(page))
    soup = BeautifulSoup(r.text, "html.parser")
    items = soup.select('.iso-item.item-row-view.clearfix')
    for item in items:
        name = item.find('div', class_='col-md-10').get_text().strip().split('\xa0')[0]
        toughness_element = item.find('div', class_='card-power-toughness')
        try:
            toughness = toughness_element.get_text().strip()
        except:
            toughness = None
        cardtype = item.find('div', class_='cardtype').get_text()
        card_dict = {
            "name": name,
            "toughness": toughness,
            "cardtype": cardtype
        }
        return_list.append(card_dict)

    if soup.select('li.next.hidden'):  # this element only exists if the last page is reached
        keep_paging = False
        print("[*] Scraper is done. Quitting...")
    else:
        page += 1

# do stuff with your list of dicts - e.g. load it into pandas and save it to a spreadsheet
This will scroll until no more pages exist - no matter how many subpages the site has.
My point in the comment above was merely that if you encounter an exception in your code, your page count would never increase. That's probably not what you want, which is why I recommended you learn a little more about the behaviour of the whole try-except-else-finally construct.
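For reference, a minimal standalone example (not specific to the crawler) of how the four branches of try-except-else-finally interact:

def fetch(should_fail):
    try:
        if should_fail:
            raise ValueError("download failed")
        data = "page html"
    except ValueError as err:
        print("except:", err)          # runs only if the try block raised
    else:
        print("else: got", data)       # runs only if the try block succeeded
    finally:
        print("finally: always runs")  # runs in both cases

fetch(should_fail=False)
fetch(should_fail=True)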
I am also baffled that the request gives the same reply and ignores the page parameter. As a dirty solution, I can offer you to first set the page-size to a number high enough to get all the items you want (this parameter works, for some reason...):
import re
from math import ceil
import requests
from bs4 import BeautifulSoup as soup

InitUrl = Url = "https://mtgsingles.gr/search"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 2  # depth of pages to be retrieved
query = "dragon"
cardSet = set()

for i in range(1, NumOfPages):
    page_html = requests.get(InitUrl, params={"page": i, "q": query, "page-size": 999})
    print(page_html.url)
    page_soup = soup(page_html.text, "html.parser")
    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})

    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")

        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"

        cardType = card.contents[3].text
        cardString = card_name + "\n" + cardP_T + "\n" + cardType + "\n"
        cardSet.add(cardString)
        print(cardString)

    NumOfCrawledPages += 1
    print("Moving to page : " + str(NumOfCrawledPages + 1) + " with " + str(len(cards)) + " (cards)\n")
