Python /bs4: trying to print temperature/city from a local website - python

I'm trying to get and print the current weather temperature and city name from a local website, but no success.
All I need it to read and print the city (Lodrina), the Temperature (23.1C) and if possible the title in ca-cond-firs ("Temperatura em declínio") - this last one changes as temps goes up or down...
This is the html section of the site:
THIS IS THE HTML (the part of matters:)
#<div class="ca-cidade">Londrina</div>
<ul class="ca-condicoes">
<li class="ca-cond-firs"><img src="/site/imagens/icones_condicoes/temperatura/temp_baixa.png" title="Temperatura em declínio"/><br/>23.1°C</li>
<li class="ca-cond"><img src="/site/imagens/icones_condicoes/vento/L.png"/><br/>10 km/h</li>
<li class="ca-cond"><div class="ur">UR</div><br/>54%</li>
<li class="ca-cond"><img src="/site/imagens/icones_condicoes/chuva.png"/><br/>0.0 mm</li>
THIS IS THE CODE I DID SO FAR:
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'lxml')
id = soup.find('a', 'id=23185109')
print(id)
any help?

from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser') # parse page as html
temp_table = soup.find_all('table', {'class':'cidadeTempo'}) # get detail of table with class name cidadeTempo
for entity in temp_table:
city_name = entity.find('h3').text # fetches name of city
city_temp_max = entity.find('span', {'class':'tempMax'}).text # fetches max temperature
city_temp_min = entity.find('span', {'class':'tempMin'}).text # fetches min temperature
print("City :{} \t Max_temp: {} \t Min_temp: {}".format(city_name, city_temp_max, city_temp_min)) # prints content
below code can get details of temprature at right side of page as you require.
result_table = soup.find('div', {'class':'ca-content-wrapper'})
print(result_table.text) # in your case there is no other div exist with class name ca-content-wrapper hence I can use it directly without iterating. you can use if condition to control which city temprature to print and which to not.
# output will be like :
# Apucarana
# 21.5°C
# 4 km/h
# UR60%
# 0.0 mm

I'm not sure what problems you are running into with your code. In my attempts to use your code, I found that I needed to use the html parser to successfully parse the website. I also used soup.findAll() in order to find elements that matched the desired class. Hopefully the below will lead you to your answer:
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser')
rows = soup.findAll('li', {'class', 'ca-cond-firs'})
print rows

You should try out the CSS3 selectors in BS4, I personally find it a lot easier to use than find and find_all.
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'lxml')
# soup.select returns the list of all the elements that matches the CSS3 selector
# get the text inside each <a> tag inside div.ca-cidade
cities = [cityTag.text for cityTag in soup.select("div.ca-cidade > a")]
# get the temperature inside each li.ca-cond-firs
temps = [tempTag.text for tempTag in soup.select("li.ca-cond-firs")]
# get the temperature status inside each li.ca-cond-firs > img title attibute
tempStatus = [tag["title"] for tag in soup.select("li.ca-cond-firs > img")]
# len(cities) == len(temps) == len(tempStatus) => This is normally true.
for i in range(len(cities)):
print("City: {}, Temperature: {}, Status: {}.".format(cities[i], temps[i], tempStatus[i]))

Here you go. You can customize that wind thing depending on icon name.
#!/usr/bin/env python
# -*- encoding: utf8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from bs4 import BeautifulSoup
import requests
def get_weather_data():
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser')
cities = soup.find('div', {"class":"ca-content-wrapper"})
weather_data = []
for city in cities.findAll("div", {"class":"ca-bg"}):
name = city.find("div", {"class":"ca-cidade"}).text
temp = city.find("li", {"class":"ca-cond-firs"}).text
conditons = city.findAll("li", {"class":"ca-cond"})
weather_data.append({
"city":name,
"temp":temp,
"conditions":[{
"wind":conditons[0].text +" "+what_wind(conditons[0].find("img")["src"]),
"humidity":conditons[1].text,
"raind":conditons[2].text,
}]
})
return weather_data
def what_wind(img):
if img.find ("NE"):
return "From North East"
if img.find ("O"):
return "From West"
if img.find ("N"):
return "From North"
#you can add other icons here
print get_weather_data()
And that is all weather data from that website.

Related

Python web scraping script does not find element by css selector

I'm trying to get this web scraper to get current electricity price from this website, it's in finnish but it's right under "Hinta nyt". https://sahko.tk/
Here's my code:
import requests
from bs4 import BeautifulSoup
url = "https://sahko.tk/"
element_selector = ""
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
elements = soup.find_all(element_selector)
if len(elements) == 0:
print("No element found with selector '%s'" % element_selector)
else:
element_text = elements[0].text
print(element_text)
I left the element_selector to empty because what ever I tried it just did not work. I'm not even sure if I'm on the right tracks.
The data you see is embedded inside <script> in that page. To parse the current price you can use next example:
import re
import json
import requests
url = "https://sahko.tk/"
data = requests.get(url).text
data = re.search(r"function prices_today\(\)\{var t= (.*?});", data).group(1)
data = json.loads(data)
print("Hinta nyt", data["now"], "snt/kWh")
Prints:
Hinta nyt 33.27 snt/kWh

How to extract data from dropdown list using python

I am trying to extract the data from drop down using python.
link:
From that,
State
Category
District
CTSO
Division
Map Type
mentions so, I want to extract data from all by selecting one by one using beautifulsoup or request library.
In my code, I gave district name statically but I want to select it one by one and extract data from it.
I tried but it doesn't work
`
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
list = ["Akola", "Amravati", "Buldana", "yavatmal", "washim"]
url = "http://igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName="
for lists in list:
urls= url+lists
# print(urls)
response = requests.get(urls)
# print(response)
soup = BeautifulSoup(response.text, "html.parser")
# print(soup)
# soups= soup.find_all("div", {"id": "level_text_2"})
# print(soups)
# for ids in soup.find_all(attrs={'id': 'location_table'}):
# print(ids)
# ids = ids.text.strip()
# print(ids)
for option in soup.find_all('option'):
print(option.text)
for tag in soup.find_all(class_="panel-body"):
# print(tag.get('ctl00_ContentPlaceHolder5_ddlDistrict'))
print(tag)
`
I want:
District name
All taluka names
and all villages name. like that
Something like this
import requests
from bs4 import BeautifulSoup
for dist_name in ["Akola", "Amravati", "Buldana", "yavatmal", "washim"]:
print('-- {} --'.format(dist_name))
r = requests.get('http://igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName={}'.format(dist_name))
if r.status_code == 200:
soup = BeautifulSoup(r.text, "html.parser")
select_list = soup.find_all('select')
for select in select_list:
print('Select name: ' + select.attrs['name'])
option_list = select.find_all('option')
for option in option_list:
print('\t option: ' + option.attrs['value'])

HTML Parsing using bs4

I am parsing an HTMl page and am having a hard time figuring out how to pull a certain 'p' tag without a class or on id. I am trying to reach the tag of 'p' with the lat and long. Here is my current code:
import bs4
from urllib import urlopen as uReq #this opens the URL
from bs4 import BeautifulSoup as soup #parses/cuts the html
my_url = 'http://www.fortwiki.com/Battery_Adair'
print(my_url)
uClient = uReq(my_url) #opens the HTML and stores it in uClients
page_html = uClient.read() # reads the URL
uClient.close() # closes the URL
page_soup = soup(page_html, "html.parser") #parses/cuts the HTML
containers = page_soup.find_all("table")
for container in containers:
title = container.tr.p.b.text.strip()
history = container.tr.p.text.strip()
lat_long = container.tr.table
print(title)
print(history)
print(lat_long)
Link to website: http://www.fortwiki.com/Battery_Adair
The <p> tag you're looking for is very common in the document, and it doesn't have any unique attributes, so we can't select it directly.
A possible solution would be to select the tag by index, as in bloopiebloopie's answer.
However that won't work unless you know the exact position of the tag.
Another possible solution would be to find a neighbouring tag that has distinguishing attributes/text and select our tag in relation to that.
In this case we can find the previous tag with text: "Maps & Images", and use find_next to select the next tag.
import requests
from bs4 import BeautifulSoup
url = 'http://www.fortwiki.com/Battery_Adair'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
b = soup.find('b', text='Maps & Images')
if b:
lat_long = b.find_next().text
This method should find the coordinates data in any www.fortwiki.com page that has a map.
You can use re to match partial text inside a tag.
import re
import requests
from bs4 import BeautifulSoup
url = 'http://www.fortwiki.com/Battery_Adair'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
lat_long = soup.find('p', text=re.compile('Lat:\s\d+\.\d+\sLong:')).text
print(lat_long)
# Lat: 24.5477038 Long: -81.8104541
I am not exactly sure what you want but this works for me. There are probably neeter ways of doing it. I am new to python
soup = BeautifulSoup(requests.get("http://www.fortwiki.com/Battery_Adair").content, "html.parser")
x = soup.find("div", id="mw-content-text").find("table").find_all("p")[8]
x = x.get_text()
x = x.split("Long:")
lat = x[0].split(" ")[1]
long = x[1]
print("LAT = " + lat)
# LAT = 24.5477038
print("LNG = " + long)
# LNG = -81.8104541

Getting data within <li> tag on Beautiful Soup or Selenium

I am trying to Extract the contents from the <li> tags
Website: http://snowload.atcouncil.org/index.php/component/vcpsnowload/item
I wanted to extract contents like for different cities by entering in the address.
Query Date :
August 04, 2017
Address :
gilbert
Latitude :
33.3528264
Longitude :
-111.789027
Elevation :
0 Feet
Elevation Limitation: ASCE 7* Ground Snow Load
Elevation ≤ 2,000 feet: Ground Snow Load is 0 psf
Please find the approach which I tried to extract the contents.
import requests
from bs4 import BeautifulSoup
page = requests.get("http://snowload.atcouncil.org/index.php/component/vcpsnowload/item")
soup = BeautifulSoup(page.content,'html.parser')
div = soup.find("div",attrs={'class':'span5'})
print div.text
Problem which am facing is like its not getting extracted completely only Query Date is only extracted.
Also i tried with different parsers like 'html.parser' ,'html5lib','lxml'which gives the same result.
Please try to give some solution if it can be done with Selenium and Python.
You need to use HTTP POST method and send the location in the data e.g.
import requests
from bs4 import BeautifulSoup
import sys
data = {'optionCoordinate': '2','coordinate_address': 'gilbert'}
page = requests.post("http://snowload.atcouncil.org/index.php/component/vcpsnowload/item", data = data)
soup = BeautifulSoup(page.content,'html.parser')
div = soup.find("div",attrs={'class':'span5'})
print (div.text.encode(sys.stdout.encoding, errors='replace'))
There seems to be some characters my terminal cant print so I added
.encode(sys.stdout.encoding, errors='replace')
Updated:
from there you can get the li elements:
for li in div.find_all('li'):
print (li.text)
Updated again to write to CSV:
import requests
from bs4 import BeautifulSoup
with open('csvfile.csv','w') as csv:
for city in ['gilbert', 'tuscon', 'nogales']:
data = {'optionCoordinate': '2','coordinate_address': city}
page = requests.post("http://snowload.atcouncil.org/index.php/component/vcpsnowload/item", data = data)
soup = BeautifulSoup(page.content,'html.parser')
div = soup.find("div",attrs={'class':'span5'})
row = ""
for li in div.find_all('li'):
t = li.find(['span', 'p'])
if t is not None:
row += '"' + t.text + '",'
csv.write(row + '\n')
This code will get the text within each <li></li> you are targeting on that page.
from bs4 import BeautifulSoup as BS
from requests import get
site = "http://snowload.atcouncil.org/index.php/component/vcpsnowload/item"
req = get(site)
soup = BS(req.text, 'html.parser')
div = soup.find('ul', attrs={'class', 'map-info'})
list_items = div.find_all('li')
for li in list_items:
print(li.text)
Automated Solution to Extract the contents within li tags
from bs4 import BeautifulSoup
import urllib2
import requests
import sys
from selenium import webdriver
chrome_path = r"/usr/bin/chromedriver"
driver = webdriver.Chrome(chrome_path)
driver.get("http://snowload.atcouncil.org/")
driver.find_element_by_xpath("""//*[#id="adminForm"]/fieldset/div/div[2]/div[2]/label""").click()
driver.find_element_by_xpath("""//*[#id="coordinate_address"]""").click()
cities = ['pheonix']
for city in cities:
print (city)
driver.find_element_by_xpath('//*[#id="coordinate_address"]').send_keys(city)
driver.find_element_by_xpath('//*[#id="adminForm"]/fieldset/div/div[2]/button').click()
x = driver.current_url
#print x
Data = {'optionCoordinate': '2','coordinate_address': cities}
page = requests.post(x, data = Data)
soup = BeautifulSoup(page.content,'html.parser')
div = soup.find('div', attrs={'class': 'span5'})
for li in div.find_all('li'):
y = (li.text)
print y
driver.close()

Web Scraping with Python (city) as parameter

def findWeather(city):
import urllib
connection = urllib.urlopen("http://www.canoe.ca/Weather/World.html")
rate = connection.read()
connection.close()
currentLoc = rate.find(city)
curr = rate.find("currentDegree")
temploc = rate.find("</span>", curr)
tempstart = rate.rfind(">", 0, temploc)
print "current temp:", rate[tempstart+1:temploc]
The link is provided above. The issue I have is everytime I run the program and use, say "Brussels" in Belgium, as the parameter, i.e findWeather("Brussels"), it will always print 24c as the temperature whereas (as I am writing this) it should be 19c. This is the case for many other cities provided by the site. Help on this code would be appreciated.
Thanks!
This one should work:
import requests
from bs4 import BeautifulSoup
url = 'http://www.canoe.ca/Weather/World.html'
response = requests.get(url)
# Get the text of the contents
html_content = response.text
# Convert the html content into a beautiful soup object
soup = BeautifulSoup(html_content, 'lxml')
cities = soup.find_all("span", class_="titleText")
cels = soup.find_all("span", class_="currentDegree")
for x,y in zip(cities,cels):
print (x.text,y.text)

Categories

Resources