Scraping data with BS4 - text strip() not working

Scraping data with BS4 - text strip() not working - python

I am looking to scrape some publicly available data from one of the technology/analyst research firms.
I have gotten so far that I can print out the title and position but the text.strip() function has not really worked - I am probably missing something obvious.
import requests
from bs4 import BeautifulSoup
from requests.api import head
# get the data
data = requests.get("https://www.forrester.com/bio/michele-goetz?id=BIO5224")
# load data into bs4
soup = BeautifulSoup(data.text, "html.parser")
analyst_data = soup.find("div", { "class": "col-md-9" })
#print(analyst_data)
header_title = analyst_data.find("h1")
header_paragraph = analyst_data.find("p")
print(header_title,header_paragraph)
for data in header_title.find_all(), header_paragraph.find_all():
name = data.find_all("h1")[0].text.strip()
position = data.find_all("p")[1].text.strip()
print(name , position)

You have already found a tag when doing:
header_title = analyst_data.find("h1")
header_paragraph = analyst_data.find("p")
therefore, there no point of creating this for loop:
for data in header_title.find_all(), header_paragraph.find_all():
name = data.find_all("h1")[0].text.strip()
position = data.find_all("p")[1].text.strip()
print(name , position)
instead, call .text on header_title and header_paragraph. With your example:
import requests
from bs4 import BeautifulSoup
from requests.api import head
# get the data
data = requests.get("https://www.forrester.com/bio/michele-goetz?id=BIO5224")
# load data into bs4
soup = BeautifulSoup(data.text, "html.parser")
analyst_data = soup.find("div", { "class": "col-md-9" })
#print(analyst_data)
header_title = analyst_data.find("h1")
header_paragraph = analyst_data.find("p")
print(header_title.text.strip(), header_paragraph.text.strip())
Output:
Michele Goetz VP, Principal Analyst

Related

How to not print empty line?

I'm trying to scrap some links from a site but I'm running into an issue where my for loop will stop at the first link.
Currently What I have
import requests
import lxml
from bs4 import BeautifulSoup
url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')
fighter_links = soup.find('td', {
'class': 'b-statistics__table-col'
}).find_all('a')
fighterLinks = []
for anchor in fighter_links:
# urls = anchor['href']
fighterLinks.append(anchor['href'])
print(fighterLinks)
When I print I'm getting
['http://ufcstats.com/fighter-details/93fe7332d16c6ad9']
Site I'm trying to pull from

when you do
fighter_links = soup.find('td', {'class': 'b-statistics__table-col'}).find_all('a')
you are only getting the first table record. soup.find will only return the first match that it finds. what you need to do is change it to
fighter_links = soup.find_all('td', {'class': 'b-statistics__table-col'})
fighterLinks = []
that will get you all the table enteries that match your class name, and from there you need to do loop to extract out the links
for link in fighter_links:
if(link.find('a')):
fighterLinks.append(link.find('a').get('href'))

I don't know if this will help, but I hope it does:
import requests
from bs4 import BeautifulSoup
url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')
aa = soup.select("a.b-link_style_black")
fighterLinks = []
for i in aa:
for k in i:
fighterLinks.append(aa[aa.index(i)].attrs["href"])
print(fighterLinks)
outputs:
['http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/1c5879330d42255f', 'http://ufcstats.com/fighter-details/1c5879330d42255f', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a']

Requests will fail on some connections in this instance. Better use cloudscraper: (pip install cloudscraper)
import cloudscraper
from bs4 import BeautifulSoup
scraper = cloudscraper.create_scraper()
soup = BeautifulSoup(scraper.get("http://ufcstats.com/statistics/fighters?char=a").text)
links = soup.select_one('.b-statistics__table').select('a')
print(set([x.get('href') for x in links]))
This returns:
{'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/1c5879330d42255f'}

Web Scraping a table with BeautifulSoup

I am trying to fetch the calendar of events of current month and store it into the file in the same format, such as, date, time, event and location) from my school's website https://www.umkc.edu/calendar/
I don't have much experience in HTML so I don't really know what I am dealing with, that's why I used iframe, but I don't even know if it's the right thing.
Can you help me with getting the table that has the events?
That's what I have so far:
import requests
from bs4 import BeautifulSoup
html = requests.get("https://www.umkc.edu/calendar/")
soup = BeautifulSoup(html.content, "html.parser")
print(soup.find_all("iframe"))

So, data is dynamically added. You could use the same request as is performed for iframe content and use regex (or split) to get out the required javascript to then parse with hjson (due to unquoted keys); extract the html content and parse with bs4. The alternative, shown at bottom is to investigate the rss feed.
from bs4 import BeautifulSoup as bs
import requests, re, hjson
import pandas as pd
r = requests.get('https://www.trumba.com/s.aspx?calendar=UMKC&widget=main&srpc.cbid=trumba.spud.4&srpc.get=true')
p = re.compile(r'requestComplete\((.*)\);', re.DOTALL)
data = hjson.loads(re.sub('\r+|\n+|\t+','', p.findall(r.text)[0].strip()))
soup = bs(data['body'],'lxml')
p2 = re.compile(r"'(.*?)'", re.DOTALL)
results = []
for tr in soup.select('.twSimpleTableTable tr')[2:]:
date = tr.select_one('.twStartDate').text
time = tr.select_one('td:has(.twStartDate) + td, .twStartTime').text
event = p2.findall(tr.select_one('.txVMid')['title'])[0].strip()
location = tr.select_one('.twLocation')
if location is None:
location = ''
else:
location = ' '.join([string for string in location.stripped_strings])
row = [date, time, event, location]
results.append(row)
df = pd.DataFrame(results)
print(df)
Or (this needs further investigation and work):
import requests, re
import pandas as pd
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.trumba.com/calendars/UMKC.rss')
soup = bs(r.content, 'lxml')
df = pd.DataFrame(zip([i.text for i in soup.select('title')]
,[i.text for i in soup.select('link')]
,[re.sub('<br />|<br/>','',i.text) for i in soup.select('description')]))
print(df)

Data Scrape from a website to a csv file format using python and beautifulsoup

I am trying to get all the graphics card details into a csv file but not able to scrape the data(doing this as a project to scrape data for learning purposes). I am new to python and html.
I am using request and beautifulsoup libraries.
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
uClient = uReq(my_url)
Negg = uClient.read()
uClient.close
Complete_Graphics_New_Egg = soup(Negg,"html.parser")
Container_Main = Complete_Graphics_New_Egg.findAll("div",{"class":"item-container"})
Container_Main5 = str(Container_Main[5])
path_file='C:\\Users\\HP\\Documents\\Python\\Container_Main5.txt'
file_1 = open(path_file,'w')
file_1.write(Container_Main5)
file_1.close()
##Container_Main_details = Container_Main5.a
#div class="item-badges"
Container_5_1 = str(Container_Main[5].findAll("ul",{"class":"item-features"}))
path_file='C:\\Users\\HP\\Documents\\Python\\Container_test_5_1.txt'
file_5_1 = open(path_file,'w')
file_5_1.write(Container_5_1)
file_5_1.close()
Container_5_1.li
Container_5_2 = str(Container_Main[5].findAll("p",{"class":"item-promo"}))
path_file='C:\\Users\\HP\\Documents\\Python\\Container_test_5_2.txt'
file_5_2 = open(path_file,'w')
file_5_2.write(Container_5_2)
file_5_2.close()
##p class="item-promo"
##div class="item-info"

This should get you started. I'll break it down a bit too for you so you can modify and play while you're learning. I'm also suggesting to use Pandas, as it's a popular library for data manipulation and you'll be using in the near future if you're already not using it
I first initialize a results dataframe to store all the data you'll be parsing:
import bs4
import requests
import pandas as pd
results = pd.DataFrame()
Next, get the html form the site and pass that into BeautifulSoup:
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
response = requests.get(my_url)
html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')
Then you had it find all the tags you were interested in. The only thing I added was have it iterate over each of those tags/elements it finds:
Container_Main = soup.find_all("div",{"class":"item-container"})
for container in Container_Main:
and then in each of those containers, grab the data you wanted from the item features and item promo. I store that data into a temporary dataframe (of 1 row) and then append that to my results dataframe. So after each iteration, the temp dataframe is overwritten with the new info, but the results won;t be overwritten, it'll just add on.
Lastly, use pandas to save the dataframe to csv.
results.to_csv('path/file.csv', index=False)
So full code:
import bs4
import requests
import pandas as pd
results = pd.DataFrame()
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
response = requests.get(my_url)
html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')
Container_Main = soup.find_all("div",{"class":"item-container"})
for container in Container_Main:
item_features = container.find("ul",{"class":"item-features"})
# if there are no item-fetures, move on to the next container
if item_features == None:
continue
temp_df = pd.DataFrame(index=[0])
features_list = item_features.find_all('li')
for feature in features_list:
split_str = feature.text.split(':')
header = split_str[0]
data = split_str[1].strip()
temp_df[header] = data
promo = container.find_all("p",{"class":"item-promo"})[0].text
temp_df['promo'] = promo
results = results.append(temp_df, sort = False).reset_index(drop = True)
results.to_csv('path/file.csv', index=False)

Getting data within <li> tag on Beautiful Soup or Selenium

I am trying to Extract the contents from the <li> tags
Website: http://snowload.atcouncil.org/index.php/component/vcpsnowload/item
I wanted to extract contents like for different cities by entering in the address.
Query Date :
August 04, 2017
Address :
gilbert
Latitude :
33.3528264
Longitude :
-111.789027
Elevation :
0 Feet
Elevation Limitation: ASCE 7* Ground Snow Load
Elevation ≤ 2,000 feet: Ground Snow Load is 0 psf
Please find the approach which I tried to extract the contents.
import requests
from bs4 import BeautifulSoup
page = requests.get("http://snowload.atcouncil.org/index.php/component/vcpsnowload/item")
soup = BeautifulSoup(page.content,'html.parser')
div = soup.find("div",attrs={'class':'span5'})
print div.text
Problem which am facing is like its not getting extracted completely only Query Date is only extracted.
Also i tried with different parsers like 'html.parser' ,'html5lib','lxml'which gives the same result.
Please try to give some solution if it can be done with Selenium and Python.

You need to use HTTP POST method and send the location in the data e.g.
import requests
from bs4 import BeautifulSoup
import sys
data = {'optionCoordinate': '2','coordinate_address': 'gilbert'}
page = requests.post("http://snowload.atcouncil.org/index.php/component/vcpsnowload/item", data = data)
soup = BeautifulSoup(page.content,'html.parser')
div = soup.find("div",attrs={'class':'span5'})
print (div.text.encode(sys.stdout.encoding, errors='replace'))
There seems to be some characters my terminal cant print so I added
.encode(sys.stdout.encoding, errors='replace')
Updated:
from there you can get the li elements:
for li in div.find_all('li'):
print (li.text)
Updated again to write to CSV:
import requests
from bs4 import BeautifulSoup
with open('csvfile.csv','w') as csv:
for city in ['gilbert', 'tuscon', 'nogales']:
data = {'optionCoordinate': '2','coordinate_address': city}
page = requests.post("http://snowload.atcouncil.org/index.php/component/vcpsnowload/item", data = data)
soup = BeautifulSoup(page.content,'html.parser')
div = soup.find("div",attrs={'class':'span5'})
row = ""
for li in div.find_all('li'):
t = li.find(['span', 'p'])
if t is not None:
row += '"' + t.text + '",'
csv.write(row + '\n')

This code will get the text within each <li></li> you are targeting on that page.
from bs4 import BeautifulSoup as BS
from requests import get
site = "http://snowload.atcouncil.org/index.php/component/vcpsnowload/item"
req = get(site)
soup = BS(req.text, 'html.parser')
div = soup.find('ul', attrs={'class', 'map-info'})
list_items = div.find_all('li')
for li in list_items:
print(li.text)

Automated Solution to Extract the contents within li tags
from bs4 import BeautifulSoup
import urllib2
import requests
import sys
from selenium import webdriver
chrome_path = r"/usr/bin/chromedriver"
driver = webdriver.Chrome(chrome_path)
driver.get("http://snowload.atcouncil.org/")
driver.find_element_by_xpath("""//*[#id="adminForm"]/fieldset/div/div[2]/div[2]/label""").click()
driver.find_element_by_xpath("""//*[#id="coordinate_address"]""").click()
cities = ['pheonix']
for city in cities:
print (city)
driver.find_element_by_xpath('//*[#id="coordinate_address"]').send_keys(city)
driver.find_element_by_xpath('//*[#id="adminForm"]/fieldset/div/div[2]/button').click()
x = driver.current_url
#print x
Data = {'optionCoordinate': '2','coordinate_address': cities}
page = requests.post(x, data = Data)
soup = BeautifulSoup(page.content,'html.parser')
div = soup.find('div', attrs={'class': 'span5'})
for li in div.find_all('li'):
y = (li.text)
print y
driver.close()

Python /bs4: trying to print temperature/city from a local website

I'm trying to get and print the current weather temperature and city name from a local website, but no success.
All I need it to read and print the city (Lodrina), the Temperature (23.1C) and if possible the title in ca-cond-firs ("Temperatura em declínio") - this last one changes as temps goes up or down...
This is the html section of the site:
THIS IS THE HTML (the part of matters:)
#<div class="ca-cidade">Londrina</div>
<ul class="ca-condicoes">
<li class="ca-cond-firs"><img src="/site/imagens/icones_condicoes/temperatura/temp_baixa.png" title="Temperatura em declínio"/><br/>23.1°C</li>
<li class="ca-cond"><img src="/site/imagens/icones_condicoes/vento/L.png"/><br/>10 km/h</li>
<li class="ca-cond"><div class="ur">UR</div><br/>54%</li>
<li class="ca-cond"><img src="/site/imagens/icones_condicoes/chuva.png"/><br/>0.0 mm</li>
THIS IS THE CODE I DID SO FAR:
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'lxml')
id = soup.find('a', 'id=23185109')
print(id)
any help?

from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser') # parse page as html
temp_table = soup.find_all('table', {'class':'cidadeTempo'}) # get detail of table with class name cidadeTempo
for entity in temp_table:
city_name = entity.find('h3').text # fetches name of city
city_temp_max = entity.find('span', {'class':'tempMax'}).text # fetches max temperature
city_temp_min = entity.find('span', {'class':'tempMin'}).text # fetches min temperature
print("City :{} \t Max_temp: {} \t Min_temp: {}".format(city_name, city_temp_max, city_temp_min)) # prints content
below code can get details of temprature at right side of page as you require.
result_table = soup.find('div', {'class':'ca-content-wrapper'})
print(result_table.text) # in your case there is no other div exist with class name ca-content-wrapper hence I can use it directly without iterating. you can use if condition to control which city temprature to print and which to not.
# output will be like :
# Apucarana
# 21.5°C
# 4 km/h
# UR60%
# 0.0 mm

I'm not sure what problems you are running into with your code. In my attempts to use your code, I found that I needed to use the html parser to successfully parse the website. I also used soup.findAll() in order to find elements that matched the desired class. Hopefully the below will lead you to your answer:
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser')
rows = soup.findAll('li', {'class', 'ca-cond-firs'})
print rows

You should try out the CSS3 selectors in BS4, I personally find it a lot easier to use than find and find_all.
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'lxml')
# soup.select returns the list of all the elements that matches the CSS3 selector
# get the text inside each <a> tag inside div.ca-cidade
cities = [cityTag.text for cityTag in soup.select("div.ca-cidade > a")]
# get the temperature inside each li.ca-cond-firs
temps = [tempTag.text for tempTag in soup.select("li.ca-cond-firs")]
# get the temperature status inside each li.ca-cond-firs > img title attibute
tempStatus = [tag["title"] for tag in soup.select("li.ca-cond-firs > img")]
# len(cities) == len(temps) == len(tempStatus) => This is normally true.
for i in range(len(cities)):
print("City: {}, Temperature: {}, Status: {}.".format(cities[i], temps[i], tempStatus[i]))

Here you go. You can customize that wind thing depending on icon name.
#!/usr/bin/env python
# -*- encoding: utf8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from bs4 import BeautifulSoup
import requests
def get_weather_data():
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser')
cities = soup.find('div', {"class":"ca-content-wrapper"})
weather_data = []
for city in cities.findAll("div", {"class":"ca-bg"}):
name = city.find("div", {"class":"ca-cidade"}).text
temp = city.find("li", {"class":"ca-cond-firs"}).text
conditons = city.findAll("li", {"class":"ca-cond"})
weather_data.append({
"city":name,
"temp":temp,
"conditions":[{
"wind":conditons[0].text +" "+what_wind(conditons[0].find("img")["src"]),
"humidity":conditons[1].text,
"raind":conditons[2].text,
}]
})
return weather_data
def what_wind(img):
if img.find ("NE"):
return "From North East"
if img.find ("O"):
return "From West"
if img.find ("N"):
return "From North"
#you can add other icons here
print get_weather_data()
And that is all weather data from that website.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scraping data with BS4 - text strip() not working - python

Related

How to not print empty line?

Web Scraping a table with BeautifulSoup

Data Scrape from a website to a csv file format using python and beautifulsoup

Getting data within <li> tag on Beautiful Soup or Selenium

Python /bs4: trying to print temperature/city from a local website

Categories

Resources