Web scraping code using BS4 + requests not refreshing - Python

I have a problem with code that scrapes a weather website. It's supposed to update hourly, but the data it returns is not the current data on the website; it never updates and keeps feeding back the same data continuously. Please help!
Also, I need help scraping the weather icon from the site.
Here is my code:
from bs4 import BeautifulSoup
from plyer import notification
import requests
import time

if __name__ == '__main__':
    while True:
        def notifyMe(title, message):
            notification.notify(
                title = title,
                message = message,
                #app_icon = icon,
                timeout = 7
            )
        try:
            # site = requests.get('https://weather.com/weather/today/l/5.02,7.97?par=google')
            site = requests.get('https://weather.com/en-NG/weather/today/l/4dce0117809bca3e9ecdaa65fb45961a9718d6829adeb72b6a670240e10bd8c9')
            # site = requests.get('http://localhost/weather.com/weather/today/l/5.02,7.97.html')
            soup = BeautifulSoup(site.content, 'html.parser')
            day = soup.find(class_='CurrentConditions--CurrentConditions--14ztG')
            location = day.find(class_='CurrentConditions--location--2_osB').get_text()
            timestamp = day.find(class_='CurrentConditions--timestamp--3_-CV').get_text()
            tempValue = day.find(class_='CurrentConditions--tempValue--1RYJJ').get_text()
            phraseValue = day.find(class_='CurrentConditions--phraseValue--17s79').get_text()
            precipValue = day.find(class_='CurrentConditions--precipValue--1RgXi').get_text()
            #icon = day.find(id='svg-symbol-cloud').get_icon()
            weather = timestamp + "\n" + tempValue + " " + phraseValue + "\n" + precipValue
        except requests.exceptions.ConnectionError:
            location = "Couldn't get a location."
            weather = "Error connecting to website."
        except AttributeError:
            weather = timestamp + "\n" + tempValue + " " + phraseValue
        # print(weather)
        notifyMe(location, weather)
        time.sleep(30)
Expected output:
Uyo, Akwa Ibom Weather
As of 13:28 WAT
30° Mostly Cloudy
55% chance of rain until 14:00

import requests
from bs4 import BeautifulSoup

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    x = list(soup.select_one('.card').stripped_strings)
    del x[4:8]
    print(x)

main('https://weather.com/en-NG/weather/today/l/4dce0117809bca3e9ecdaa65fb45961a9718d6829adeb72b6a670240e10bd8c9')
Output:
['Uyo, Akwa Ibom Weather', 'As of 8:03 WAT', '24°', 'Cloudy', '47% chance of rain until 9:00']
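If the stale-data problem comes back, one thing worth trying is making a fresh request on every cycle and asking any intermediate cache not to serve a stored copy. A rough sketch that combines the parsing above with an hourly loop (the Cache-Control header and the 3600-second interval are assumptions, not something the site documents):
import time
import requests
from bs4 import BeautifulSoup

URL = 'https://weather.com/en-NG/weather/today/l/4dce0117809bca3e9ecdaa65fb45961a9718d6829adeb72b6a670240e10bd8c9'

while True:
    # new request every cycle; Cache-Control hints caches to revalidate instead of replaying old data
    r = requests.get(URL, headers={'Cache-Control': 'no-cache'})
    soup = BeautifulSoup(r.text, 'html.parser')
    current = list(soup.select_one('.card').stripped_strings)
    print(current[:5])
    time.sleep(3600)  # assumed hourly refresh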

It appears the error might have been from the site, because it's working now without the issues. Thank you all for the suggestions. @Ahmed American, your code is beautiful; I've learned from it. @furas, I'll try to construct the SVG as you suggested.
That's the output.
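As a rough idea of what constructing the icon might look like, the sketch below assumes the icon is an inline <svg> whose <use> tag points at a sprite symbol elsewhere in the page; this may not match the site's current markup.
# 'day' is the CurrentConditions container from the question's code
icon_svg = day.find('svg')
if icon_svg is not None:
    use = icon_svg.find('use')
    if use is not None:
        # the href usually points at a sprite symbol defined elsewhere on the page
        symbol_id = use.get('href') or use.get('xlink:href')
        print(symbol_id)  # e.g. '#svg-symbol-cloud'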

Related

How to output only relevant changes while scraping for new discounts?

In a previous question I got an answer from Hedgehog (How to check for new discounts and send to telegram if changes detected?).
But my follow-up question is: how can I get only the new (product) items in the output, and not all the text that has changed? My feeling is that the current output is literally anything that changed on the website, not only the newly added discounts.
Here is the code; see the attachment for what the output looks like. Thanks again for all the effort.
# Import all necessary packages
import requests, time, difflib, os, re, schedule, cloudscraper
from bs4 import BeautifulSoup
from datetime import datetime

# Define scraper
scraper = cloudscraper.create_scraper()

# Send a message via a telegram bot
def telegram_bot_sendtext(bot_message):
    bot_token = '1XXXXXXXXXXXXXXXXXXXXXXXXXXG5pses8'
    bot_chatID = '-XXXXXXXXXXX'
    send_text = 'https://api.telegram.org/bot' + bot_token + '/sendMessage?chat_id=' + bot_chatID \
                + '&parse_mode=Markdown&text=' + bot_message
    response = requests.get(send_text)
    return response.json()

PrevVersion = ""
FirstRun = True

while True:
    # Download the page with the specified URL
    response = scraper.get("https://").content
    # Url for in the messages to show
    url = "https://"
    # Act like a browser
    #headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    # Parse the downloaded page and check for discount on the page
    soup = BeautifulSoup(response, 'html.parser')

    def get_discounts(soup):
        for d in soup.select('.cept-discount'):
            if d.text != '' and 65 < int(''.join(filter(str.isdigit, d.text))) < 99:
                return True
            else:
                return False

    # Remove all scripts and styles
    for script in soup(["script", "style"]):
        script.extract()

    discounts = get_discounts(soup)
    soup = soup.get_text()

    # Compare the page text to the previous version and check if there are any discounts in your range
    if PrevVersion != soup and discounts:
        # On the first run - just memorize the page
        if FirstRun == True:
            PrevVersion = soup
            FirstRun = False
            print("Start Monitoring " + url + " " + str(datetime.now()))
        else:
            print("Changes detected at: " + str(datetime.now()))
            OldPage = PrevVersion.splitlines()
            NewPage = soup.splitlines()
            diff = difflib.context_diff(OldPage, NewPage, n=0)
            out_text = "\n".join([ll.rstrip() for ll in '\n'.join(diff).splitlines() if ll.strip()])
            print(out_text)
            OldPage = NewPage
            # Send a message with the telegram bot
            telegram_bot_sendtext("Nieuwe prijsfout op Pepper " + url)
            # print('\n'.join(diff))
            PrevVersion = soup
    else:
        print("No Changes " + str(datetime.now()))
        time.sleep(5)
        continue
What happens?
As discussed, your assumption is going in the right direction: all the changes identified by difflib will be displayed.
It may be possible to adjust what difflib reports, but difflib is not really necessary for this task.
How to fix?
The first step is to upgrade get_discounts(soup) so that it not only checks whether a discount is in range but also collects information about the item itself, in case you want to display it or work with it later:
def get_discounts(soup):
    discounts = []
    for d in soup.select('.cept-discount'):
        if d.text != '' and 65 < int(''.join(filter(str.isdigit, d.text))) < 99:
            discounts.append({
                'name': d.find_previous('strong').a.get('title'),
                'url': d.find_previous('strong').a.get('href'),
                'discount': d.text,
                'price': d.parent.parent.select_one('.thread-price').text,
                'bestprice': d.previous_sibling.text
            })
    return discounts
The second step is to check whether there is a new discount, similar in spirit to the difflib comparison but more focused:
def compare_discounts(d1: list, d2: list):
    diff = [i for i in d1 + d2 if i not in d1]
    result = len(diff) == 0
    if not result:
        return diff
The last step is to react to changes in the discounts: if new ones are found, print their URLs so you can go directly to the offered products.
Note: because we have stored additional information in our list of dicts, you can adjust the printing to show the whole record or only specific attributes.
if newDiscounts:
    # Send a message with the telegram bot
    print('\n'.join([c['url'] for c in newDiscounts]))
    telegram_bot_sendtext("Nieuwe prijsfout op Pepper " + url)
Example
import requests, time, difflib, os, re, schedule, cloudscraper
from bs4 import BeautifulSoup
from datetime import datetime

# Define scraper
scraper = cloudscraper.create_scraper()

# Send a message via a telegram bot
def telegram_bot_sendtext(bot_message):
    bot_token = '1XXXXXXXXXXXXXXXXXXXXXXXXXXG5pses8'
    bot_chatID = '-XXXXXXXXXXX'
    send_text = 'https://api.telegram.org/bot' + bot_token + '/sendMessage?chat_id=' + bot_chatID + '&parse_mode=Markdown&text=' + bot_message
    response = requests.get(send_text)
    return response.json()

PrevVersion = ""
PrevDiscounts = []
FirstRun = True

def get_discounts(soup):
    discounts = []
    for d in soup.select('.cept-discount'):
        if d.text != '' and 65 < int(''.join(filter(str.isdigit, d.text))) < 99:
            discounts.append({
                'name': d.find_previous('strong').a.get('title'),
                'url': d.find_previous('strong').a.get('href'),
                'discount': d.text,
                'price': d.parent.parent.select_one('.thread-price').text,
                'bestprice': d.previous_sibling.text
            })
    return discounts

def compare_discounts(d1: list, d2: list):
    diff = [i for i in d1 + d2 if i not in d1]
    result = len(diff) == 0
    if not result:
        return diff

while True:
    # Download the page with the specified URL
    response = requests.get("https://nl.pepper.com/nieuw").content
    # Url for in the messages to show
    url = "https://nl.pepper.com/nieuw"
    # Parse the downloaded page and check for discount on the page
    soup = BeautifulSoup(response, 'html.parser')
    # Remove all scripts and styles
    for script in soup(["script", "style"]):
        script.extract()
    discounts = get_discounts(soup)
    souptext = soup.get_text()
    # Compare the page text to the previous version and check if there are any discounts in your range
    if PrevVersion != souptext and discounts:
        # On the first run - just memorize the page
        if FirstRun == True:
            PrevVersion = souptext
            PrevDiscounts = discounts
            FirstRun = False
            print("Start Monitoring " + url + " " + str(datetime.now()))
        else:
            print("Changes detected at: " + str(datetime.now()))
            newDiscounts = compare_discounts(PrevDiscounts, discounts)
            if newDiscounts:
                print('\n'.join([c['url'] for c in newDiscounts]))
                # Send a message with the telegram bot
                telegram_bot_sendtext("Nieuwe prijsfout op Pepper " + url)
            else:
                print('These are general changes but there are no new discounts available.')
            PrevVersion = souptext
            PrevDiscounts = discounts
    else:
        print("No Changes " + str(datetime.now()))
        time.sleep(10)
        continue
Output
Start Monitoring https://nl.pepper.com/nieuw 2021-12-12 12:28:38.391028
No Changes 2021-12-12 12:28:54.009881
Changes detected at: 2021-12-12 12:29:04.429961
https://nl.pepper.com/aanbiedingen/gigaset-plug-startpakket-221003
No Changes 2021-12-12 12:29:14.698933
No Changes 2021-12-12 12:29:24.985394
No Changes 2021-12-12 12:29:35.271794
No Changes 2021-12-12 12:29:45.629790
No Changes 2021-12-12 12:29:55.917246
Changes detected at: 2021-12-12 12:30:06.184814
These are general changes but there are no new discounts available.

Finding when a webpage updates via Python?

So, I am scraping a webpage that has an element displaying an integer. When I scrape that element, I store the plain text in a variable; then, each time it scrapes, I compare the variable to the plain text currently on the webpage. I am not sure if maybe I need to send a request to the webpage each time?
from win10toast import ToastNotifier
from _overlapped import NULL
from plyer import notification
import requests
from bs4 import BeautifulSoup

toaster = ToastNotifier()
toaster.show_toast("Notification!", "Alert!", threaded=True, icon_path=NULL, duration=3)

URL = "https://rocketleague.tracker.network/rocket-league/profile/steam/76561198074072333/mmr?playlist=13"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

_title = ""
_message = ""
recent_mmr = "111"

def get_mmr(url):
    results = soup.find_all(class_="stat")
    for stat in results:
        titles = stat.find_all(class_="label")
        for t in titles:
            if (t.text.strip() == "Rating"):
                val = stat.find(class_="value").text.strip()
                return val

def get_rank(url):
    results = soup.find(class_="stat tier")
    rank = results.find(class_="label")
    return rank.text.strip()

_message = "Rank: " + get_rank(URL) + "\n" + "MMR: " + get_mmr(URL)
recent_mmr = get_mmr(URL)

import time
while toaster.notification_active():
    time.sleep(0.1)

notification.notify(
    title="Ranked 3v3",
    message=_message,
    app_icon=NULL,
    timeout=10
)

print(recent_mmr)
recent_mmr = get_mmr(URL)

while True:
    print('running')
    # page = requests.get(URL)
    recent_mmr = get_mmr(URL)
    mmr_temp = recent_mmr
    print(mmr_temp + "(temp mmr)")
    if mmr_temp == recent_mmr:
        print("No update, recent MMR: " + recent_mmr)
        mmr_temp = recent_mmr
        time.sleep(60)
    else:
        notification.notify(
            title="Ranked 3v3",
            message=_message,
            app_icon=NULL,
            timeout=10
        )
        time.sleep(60)
        recent_mmr = get_mmr(URL)
        mmr_temp = recent_mmr
        print("Updated, recent MMR: " + recent_mmr)
You're scraping the webpage to get the recent_mmr number, copying that to mmr_temp, and then immediately comparing to see if they're equal -- well of course they are, because you just copied it!
You need to reorganize the loop a little bit, and copy the mmr variable at the bottom of the loop:
previous_mmr = None

while True:
    recent_mmr = get_mmr()
    if recent_mmr != previous_mmr:
        print("mmr changed")
        previous_mmr = recent_mmr
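Note also that the original get_mmr() reuses the soup built from a single request at the top of the script, so the page is never downloaded again and the value can never change. A minimal sketch of the whole loop with the request moved inside the function (the class names come from the question's code; the 60-second polling interval is an assumption):
import time
import requests
from bs4 import BeautifulSoup

URL = "https://rocketleague.tracker.network/rocket-league/profile/steam/76561198074072333/mmr?playlist=13"

def get_mmr(url):
    # fetch the page fresh on every call so the value can actually change
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    for stat in soup.find_all(class_="stat"):
        label = stat.find(class_="label")
        if label and label.text.strip() == "Rating":
            return stat.find(class_="value").text.strip()

previous_mmr = None
while True:
    recent_mmr = get_mmr(URL)
    if recent_mmr != previous_mmr:
        print("MMR changed:", recent_mmr)
        previous_mmr = recent_mmr
    time.sleep(60)  # assumed polling interval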

Selenium web scraping: trying to get a product list but always getting the same product

I am trying to get a product list from a website with Selenium. I prototyped the program and everything worked perfectly, but now that I've built a loop to get all products it just gives me the same product 484 times (that's the number of products on the website).
Here is my code:
from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client
import selenium
from selenium import webdriver

# URL to web scrape from
page_url = "https://www.smythstoys.com/at/de-at/spielzeug/lego/c/SM100114"
driver = webdriver.Chrome()
driver.get(page_url)

buttonName = "loadMoreProducts"
loadMoreButton = driver.find_element_by_id(buttonName)
while loadMoreButton is not None:
    try:
        try:
            loadMoreButton.click()
        except selenium.common.exceptions.ElementNotInteractableException:
            break
    except selenium.common.exceptions.ElementClickInterceptedException:
        break

uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# gets all products
containers = driver.find_elements_by_tag_name('article')
print(len(containers))

# name the output file to write to local disk
out_filename = "smythstoys_product_data.csv"
# header of csv file to be written
headers = "product_name;price; info \n"

# opens file, and writes headers
f = open(out_filename, "w")
f.write(headers)

# loops through all products
# -----------------------------------------------------------------------
# here is the problem:
for container in driver.find_elements_by_tag_name('article'):
    print("----------------------------------------------------------------------")
    product_name_container = container.find_element_by_xpath("//h2[@class ='prodName trackProduct']")
    product_name = product_name_container.text
    print(product_name)
    price_container = container.find_element_by_xpath("//div[@class ='price']")
    price = price_container.text
    print("price:", price)
    # ------------------------------------------------------------------------------------
    try:
        info_container = container.find_element_by_xpath("//span[@class ='decalImage-right']").text
        print(info_container)
        if not info_container:
            info = "no special type"
            print(info)
            print(info_container)
            f.write(product_name + "; " + price + "; " + info + "\n")
            continue
        if info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/hed/h5f/8823589830686" \
                             "/lego-hard-to-find-decal_CE.svg":
            info = "seltenes Set"
        elif info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/h41/h70" \
                               "/8823587930142/new-decal_CE%20%281%29.svg":
            info = "neues Set"
        elif info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/hde/hae" \
                               "/8871381303326/sale-decal_CE.svg":
            info = "Sale"
        else:
            info = "unknown type" + info_container
            print(info)
            print(info_container)
    except NameError:
        print("no attribute")
        if info_container is None:
            info = "unknown type"
            print(info)
    # writes the dataset to file
    f.write(product_name + "; " + price + "; " + info + "\n")

f.close()  # Close the file
My output is:
LEGO Star Wars 75244 Tantive IV
price: 199,99€
no special type
and that 484x
I'm not sure why you used Selenium to get the products when requests can do it smoothly. The following is how you can get all the products using requests.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

link = "https://www.smythstoys.com/at/de-at/at/de-at/spielzeug/lego/c/SM100114/load-more?"
params = {'q': ':bestsellerRating:productVisible:true', 'page': '1'}

p = 0
while True:
    params['page'] = p
    r = requests.get(link, params=params, headers={
        'content-type': 'application/json; charset=utf-8'
    })
    soup = BeautifulSoup(r.json()['htmlContent'], "lxml")
    if not soup.select_one("a.trackProduct[href]"):
        break
    for item in soup.select("a.trackProduct[href]"):
        product_name = item.select_one("h2.prodName").get_text(strip=True)
        product_price = item.select_one("[itemprop='price']").get("content")
        print(product_name, product_price)
    p += 1
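If you still want the semicolon-separated CSV file that the original script produced, the rows can be written with the csv module instead of manual string concatenation. A small sketch under the assumption that name and price are the fields you want (the filename and example row are only illustrative):
import csv

# rows collected as (name, price) tuples inside the loop above, e.g.:
rows = [("LEGO Star Wars 75244 Tantive IV", "199.99")]

with open("smythstoys_product_data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=";")
    writer.writerow(["product_name", "price"])
    writer.writerows(rows)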

Web crawler not able to process more than one webpage

I am trying to extract some information about MTG cards from a webpage with the following program, but I repeatedly retrieve information about the initial page given (InitUrl). The crawler is unable to proceed further. I have started to believe that I am not using the correct URLs, or maybe there is a restriction on using urllib that slipped my attention. Here is the code that I have struggled with for weeks now:
import re
from math import ceil
from urllib.request import urlopen as uReq, Request
from bs4 import BeautifulSoup as soup

InitUrl = "https://mtgsingles.gr/search?q=dragon"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 4  # depth of pages to be retrieved
query = InitUrl.split("?")[1]

for i in range(0, NumOfPages):
    if i == 0:
        Url = InitUrl
    else:
        Url = URL_Next
    print(Url)

    UClient = uReq(Url)  # downloading the url
    page_html = UClient.read()
    UClient.close()

    page_soup = soup(page_html, "html.parser")
    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})

    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")
        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"
        cardType = card.contents[3].text
        print(card_name + "\n" + cardP_T + "\n" + cardType + "\n")

    try:
        URL_Next = InitUrl + "&page=" + str(i + 2)
        print("The next URL is: " + URL_Next + "\n")
    except IndexError:
        print("Crawling process completed! No more infomation to retrieve!")
    else:
        NumOfCrawledPages += 1
        Url = URL_Next
    finally:
        print("Moving to page : " + str(NumOfCrawledPages + 1) + "\n")
One of the reasons your code fails is that you don't use cookies. The site seems to require these to allow paging.
A clean and simple way of extracting the data you're interested in would be like this:
import requests
from bs4 import BeautifulSoup

# the site actually uses this url under the hood for paging - check out Google Dev Tools
paging_url = "https://mtgsingles.gr/search?ajax=products-listing&lang=en&page={}&q=dragon"
return_list = []

# the page-scroll will only work when we support cookies
# so we fetch the page in a session
session = requests.Session()
session.get("https://mtgsingles.gr/")
All pages have a next button except the last one, so we use this knowledge to loop until the next button goes away. When it does, meaning that the last page has been reached, the button is replaced with an 'li' tag with the class 'next hidden', which only exists on the last page.
Now we're ready to start looping:
page = 1  # set count for start page
keep_paging = True  # use flag to end loop when last page is reached

while keep_paging:
    print("[*] Extracting data for page {}".format(page))
    r = session.get(paging_url.format(page))
    soup = BeautifulSoup(r.text, "html.parser")
    items = soup.select('.iso-item.item-row-view.clearfix')
    for item in items:
        name = item.find('div', class_='col-md-10').get_text().strip().split('\xa0')[0]
        toughness_element = item.find('div', class_='card-power-toughness')
        try:
            toughness = toughness_element.get_text().strip()
        except:
            toughness = None
        cardtype = item.find('div', class_='cardtype').get_text()
        card_dict = {
            "name": name,
            "toughness": toughness,
            "cardtype": cardtype
        }
        return_list.append(card_dict)
    if soup.select('li.next.hidden'):  # this element only exists if the last page is reached
        keep_paging = False
        print("[*] Scraper is done. Quitting...")
    else:
        page += 1

# do stuff with your list of dicts - e.g. load it into pandas and save it to a spreadsheet
This will keep paging until no more pages exist, no matter how many subpages the site has.
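Picking up the comment at the end of the code, a minimal sketch of loading the list of dicts into pandas and saving it to a spreadsheet (assuming pandas is installed; the filename and example row are only illustrative):
import pandas as pd

# return_list as built by the loop above, e.g.:
return_list = [{"name": "Shivan Dragon", "toughness": "5/5", "cardtype": "Creature - Dragon"}]

df = pd.DataFrame(return_list)
df.to_csv("mtg_dragons.csv", index=False)  # or df.to_excel("mtg_dragons.xlsx", index=False) with openpyxl installed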
My point in the comment above was merely that if you encounter an exception in your code, your page count would never increase. That's probably not what you want, which is why I recommended that you learn a little more about the behaviour of the whole try-except-else-finally construct.
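To illustrate that behaviour, here is a tiny standalone example (not tied to the crawler) showing which branches run with and without an exception:
def fetch(fail):
    try:
        if fail:
            raise IndexError("boom")
        value = "page data"
    except IndexError:
        print("except: runs only when the exception was raised")
    else:
        print("else: runs only when no exception occurred ->", value)
    finally:
        print("finally: always runs, exception or not")

fetch(fail=False)
fetch(fail=True)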
I am also puzzled by the request returning the same reply and ignoring the page parameter. As a dirty solution I can offer you to set the page-size to a number high enough to get all the items that you want (this parameter works, for some reason...).
import re
from math import ceil
import requests
from bs4 import BeautifulSoup as soup

InitUrl = Url = "https://mtgsingles.gr/search"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 2  # depth of pages to be retrieved
query = "dragon"
cardSet = set()

for i in range(1, NumOfPages):
    page_html = requests.get(InitUrl, params={"page": i, "q": query, "page-size": 999})
    print(page_html.url)
    page_soup = soup(page_html.text, "html.parser")
    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})
    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")
        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"
        cardType = card.contents[3].text
        cardString = card_name + "\n" + cardP_T + "\n" + cardType + "\n"
        cardSet.add(cardString)
        print(cardString)
    NumOfCrawledPages += 1
    print("Moving to page : " + str(NumOfCrawledPages + 1) + " with " + str(len(cards)) + " (cards)\n")

HTTP requests freeze after several requests

Okay, here is my code:
from lxml import html
from lxml import etree
from selenium import webdriver
import calendar
import math
import urllib
import progressbar
import requests
Using selenium
path_to_driver = '/home/vladislav/Shit/geckodriver'
browser = webdriver.Firefox(executable_path = path_to_driver)
Create a dict where I store data, and create progress bars
DataDict = {}
barY = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
barM = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
barW = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
Form parameters in a loop, construct a URL from them and send a browser.get request
for year in range(2014, 2016):
    barY.update(year)
    for month in range(1, 13):
        barM.update(month)
        weeks = math.ceil(calendar.monthrange(year, month)[1] / 4)
        for week in range(weeks):
            barW.update(week)
            if week > 2:
                start_day = 22
                end_day = calendar.monthrange(year, month)[1]
            else:
                start_day = 7 * week + 1
                end_day = 7 * (week + 1)
            start_date = str(year) + '-' + str(month).zfill(2) + '-' + str(start_day).zfill(2)
            end_date = str(year) + '-' + str(month).zfill(2) + '-' + str(end_day).zfill(2)
            params = {'end-date': end_date, 'start-date': start_date}
            url = 'http://www.finam.ru/profile/moex-akcii/aeroflot/news'
            url = url + ('&' if urllib.parse.urlparse(url).query else '?') + urllib.parse.urlencode(params)
The request itself
            browser.get(url)
            try:
                news_list = browser.find_element_by_class_name('news-list')
                news_list_text = news_list.text
                news_list_text = news_list_text.split('\n')
                for i in range(int(len(news_list_text) / 2)):
                    DataDict.update({news_list_text[2 * i]: news_list_text[2 * i + 1]})
                print("Found! Adding news to the dictionary!")
            except:
                pass
But after 2-4 requests it just freezes :(
What's the problem?
Okay, the problem was an advertising banner that appeared after several requests. The solution is simply to wait (time.sleep) until the banner disappears and then send the request again:
try:
    browser.get(url)
    try:
        news_list = browser.find_element_by_class_name('news-list')
        news_list_text = news_list.text
        news_list_text = news_list_text.split('\n')
        for i in range(int(len(news_list_text) / 2)):
            DataDict.update({news_list_text[2 * i]: news_list_text[2 * i + 1]})
        # print("Found! Adding news to the dictionary!")
    except:
        pass
    time.sleep(10)
except:
    print("perchaps this shitty AD?")
    try:
        news_list = browser.find_element_by_class_name('news-list')
        news_list_text = news_list.text
        news_list_text = news_list_text.split('\n')
        for i in range(int(len(news_list_text) / 2)):
            DataDict.update({news_list_text[2 * i]: news_list_text[2 * i + 1]})
        # print("Found! Adding news to the dictionary!")
    except:
        pass
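Instead of a fixed time.sleep, an explicit wait can block only for as long as the list is actually missing. A small sketch using Selenium's WebDriverWait, reusing browser, url and DataDict from the code above (the 20-second timeout is an assumption):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser.get(url)
try:
    # wait up to 20 seconds for the news list to appear instead of sleeping blindly
    news_list = WebDriverWait(browser, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'news-list'))
    )
    lines = news_list.text.split('\n')
    for i in range(len(lines) // 2):
        DataDict.update({lines[2 * i]: lines[2 * i + 1]})
except Exception:
    print("News list did not appear, skipping this date range")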
