I've recently started playing with BeautifulSoup to extract specific data from HTML and convert it to a pandas DataFrame. I'm stuck because the data I'm getting back is a string rather than a list.
My goal is to get everything under the :results attribute of the iw-search tag in the HTML report in its original data type (a list of dicts) so I can iterate over it and build a DataFrame.
Here is the code:
import requests
from bs4 import BeautifulSoup
countriy_code = 'BE'
city = 'BRUSSELS'.upper()
code_postal = '1000'
max_price = '250000'
URL = f'https://www.immoweb.be/en/search/apartment/for-sale?countries={countriy_code}&districts={city}&postalCodes={code_postal}&maxPrice={max_price}&orderBy=relevance'
print(URL)
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
page_results = soup.find('iw-search')
page_attributes = page_results.attrs
for key, items in page_attributes.items():
    print(type(key), type(items))
The raw HTML looks like this (note it's not the full element):
<iw-search :anchor-card-id="null" :criteria='{"countries":"BE","districts":[{"queryValue":"BRUSSELS","queryParam":"districts","slug":"brussels","shortLabel":"Brussels (District)","label":"Brussels (District)","translations":{"fr":"Bruxelles","en":"Brussels","nl":"Brussel"}}],"maxPrice":250000,"postalCodes":[{"queryValue":"1000","queryParam":"postalCodes","slug":"brussels-city","shortLabel":"Brussels City (1000)","label":"Brussels City (1000)","translations":{"fr":"Bruxelles ville","en":"Brussels City","nl":"Brussel"}}],"propertyTypes":"APARTMENT","transactionTypes":"FOR_SALE"}' :geo-point-count="876" :labellized-search='"Apartment for sale - Brussels (District) (and 1 more)"' :marketing-count="1457" :page="1" :result-count="1457" :results='[{"id":9103642,"cluster":{"minPrice":null,"maxPrice":null,"minRoom":null,"maxRoom":null,"minSurface":null,"maxSurface":null,"projectInfo":null,"bedroomRange":"","surfaceRange":""},"customerLogoUrl":"https:\/\/static.immoweb.be\/logos\/145409.gif?cache=2016051503060","customerName":"Expertissimmo","flags":{"main":"under_option","secondary":[],"percentSold":null},"media":{"pictures":[{"smallUrl":"https:\/\/static.immoweb.be\/photos\/0\/9\/1\/0\/3\/6\/4\/2\/9103642_1.gif?cache=20210106041435","mediumUrl":"https:\/\/static.immoweb.be\/photos\/0\/9\/1\/0\/3\/6\/4\/2\/M_9103642_1.jpg?cache=20210106041435","largeUrl":"https:\/\/static.immoweb.be\/photos\/0\/9\/1\/0\/3\/6\/4\/2\/9103642_1.jpg?cache=20210106041435","isVertical":false},{"smallUrl":"https:\/\/static.immoweb.be\/photos\/0\/9\/1\/0\/3\/6\/4\/2\/9103642_2.gif?cache=20210106041435","mediumUrl":```
Thanks in advance for your help.
You can get a list of dictionaries by parsing that string with the json module (the string is valid JSON):
import json
res = json.loads(page_attributes[':results'])
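Since the stated goal is a DataFrame, a minimal sketch of the next step, assuming res is the list of dicts parsed above (pd.json_normalize and the column names are taken from the sample HTML, not from the original answer):
import pandas as pd
# res is the list of dicts parsed from the ':results' attribute above;
# json_normalize flattens nested keys such as "cluster" and "media" into dotted column names.
df = pd.json_normalize(res)
print(df[['id', 'customerName']].head())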
Related
I want to scrape some specific columns (the Company Details section) from the CNBC Nasdaq 100 website, specifically for the Adobe stock. Below is a snippet of my code:
# Importing Libraries
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd

def get_company_info(url):
    original_url = url
    key = {}
    l = []
    page_response = requests.get(url, timeout=240)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    name = page_content.find('div',{"class":"quote-section-header large-header"}).find("span",{"class":"symbol"}).text
    description = page_content.find_all('div',{"class":"moduleBox"})
    for items in description:
        for i in range(len(items.find_all("tr"))-1):
            # Gather data
            key["stock_desc"] = items.find_all("td", {"class":"desc"})[i].find('div',attrs={'id':'descLong'}).text
            shares = items.find_all("td").find("table",attrs={"id":"shares"})
            for rest_of_items in shares:
                for i in range(len(items.find_all("tr"))-1):
                    key["stock_outstanding-shares"] = items.find_all("td", {"class":"bold aRit"})[i].text
                    key["stock_ownership"] = items.find_all("td", {"class":"bold aRit"})[i].text
                    key["stock_market_cap"] = items.find_all("td", {"class":"bold aRit"})[i].text
                    key["stock_lastSplit"] = items.find_all("td", {"class":"bold aRit"})[i].text
            # Print ("")
            l.append(key)
            key['name'] = name
    df = pd.DataFrame(l)
    print(df)
    return key, df

get_company_info("https://www.cnbc.com/quotes/?symbol=ADBE&tab=profile")
I'm keen to get the result in a DataFrame so that I can write it to a CSV file, but my code keeps producing an empty DataFrame. Below is the error shown.
The result I want is something like this:
The information you are looking for is not available at the URL you requested, because the page fetches it with JavaScript, which in turn requests a different URL that actually provides the data.
Example code
from bs4 import BeautifulSoup
import requests
page=requests.get("https://apps.cnbc.com/view.asp?symbol=ADBE.O&uid=stocks/summary")
soup = BeautifulSoup(page.content, 'html.parser')
Name=soup.find("h5",id="companyName").text
stock_desc= soup.find("div",id="descLong").text
table=soup.find("table",id="shares")
details=table.find_all("td", class_="bold aRit")
stock_outstanding_shares= details[0].text
stock_ownership= details[1].text
stock_market_cap= details[2].text
stock_lastSplit= details[3].text
You can then build a DataFrame from these values and export it to CSV.
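For completeness, a minimal sketch of that last step, reusing the variables defined in the code above (the output file name is just an example):
import pandas as pd
# Collect the scraped fields into a one-row DataFrame and write it out
df = pd.DataFrame([{
    "name": Name,
    "stock_desc": stock_desc,
    "stock_outstanding_shares": stock_outstanding_shares,
    "stock_ownership": stock_ownership,
    "stock_market_cap": stock_market_cap,
    "stock_lastSplit": stock_lastSplit,
}])
df.to_csv("adbe_profile.csv", index=False)  # example file name, not from the original answer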
I am trying to get all the graphics card details into a CSV file, but I'm not able to scrape the data (I'm doing this as a project to learn scraping). I am new to Python and HTML.
I am using the urllib.request and BeautifulSoup libraries.
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
uClient = uReq(my_url)
Negg = uClient.read()
uClient.close()
Complete_Graphics_New_Egg = soup(Negg,"html.parser")
Container_Main = Complete_Graphics_New_Egg.findAll("div",{"class":"item-container"})
Container_Main5 = str(Container_Main[5])
path_file='C:\\Users\\HP\\Documents\\Python\\Container_Main5.txt'
file_1 = open(path_file,'w')
file_1.write(Container_Main5)
file_1.close()
##Container_Main_details = Container_Main5.a
#div class="item-badges"
Container_5_1 = str(Container_Main[5].findAll("ul",{"class":"item-features"}))
path_file='C:\\Users\\HP\\Documents\\Python\\Container_test_5_1.txt'
file_5_1 = open(path_file,'w')
file_5_1.write(Container_5_1)
file_5_1.close()
Container_5_1.li
Container_5_2 = str(Container_Main[5].findAll("p",{"class":"item-promo"}))
path_file='C:\\Users\\HP\\Documents\\Python\\Container_test_5_2.txt'
file_5_2 = open(path_file,'w')
file_5_2.write(Container_5_2)
file_5_2.close()
##p class="item-promo"
##div class="item-info"
This should get you started. I'll break it down a bit so you can modify it and experiment while you're learning. I also suggest using pandas, as it's a popular library for data manipulation and you'll likely be using it in the near future if you aren't already.
I first initialize a results dataframe to store all the data you'll be parsing:
import bs4
import requests
import pandas as pd
results = pd.DataFrame()
Next, get the html from the site and pass that into BeautifulSoup:
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
response = requests.get(my_url)
html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')
Then, as you had it, find all the tags you're interested in. The only thing I added was having it iterate over each of those tags/elements it finds:
Container_Main = soup.find_all("div",{"class":"item-container"})
for container in Container_Main:
Then, in each of those containers, grab the data you wanted from the item features and item promo. I store that data in a temporary one-row dataframe and then append it to my results dataframe. After each iteration the temp dataframe is overwritten with the new info, but the results dataframe won't be overwritten; rows just keep getting added.
Lastly, use pandas to save the dataframe to csv.
results.to_csv('path/file.csv', index=False)
So full code:
import bs4
import requests
import pandas as pd
results = pd.DataFrame()
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
response = requests.get(my_url)
html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')
Container_Main = soup.find_all("div",{"class":"item-container"})
for container in Container_Main:
    item_features = container.find("ul",{"class":"item-features"})
    # if there are no item-features, move on to the next container
    if item_features == None:
        continue
    temp_df = pd.DataFrame(index=[0])
    features_list = item_features.find_all('li')
    for feature in features_list:
        split_str = feature.text.split(':')
        header = split_str[0]
        data = split_str[1].strip()
        temp_df[header] = data
    promo = container.find_all("p",{"class":"item-promo"})[0].text
    temp_df['promo'] = promo
    results = results.append(temp_df, sort = False).reset_index(drop = True)

results.to_csv('path/file.csv', index=False)
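One caveat: DataFrame.append was deprecated and later removed in newer pandas releases, so on a recent pandas version the accumulation step can be done with a plain list and the DataFrame constructor instead. A minimal sketch of that variant, reusing the names from the code above (promo handling omitted for brevity):
rows = []
for container in Container_Main:
    item_features = container.find("ul", {"class": "item-features"})
    if item_features is None:
        continue
    # build one dict per item from the "Header: value" text of each li
    row = {}
    for feature in item_features.find_all('li'):
        header, _, data = feature.text.partition(':')
        row[header] = data.strip()
    rows.append(row)

results = pd.DataFrame(rows)
results.to_csv('path/file.csv', index=False)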
I'm attempting to scrape the list of tickers for the Nasdaq 100 from the CNBC website: https://www.cnbc.com/nasdaq-100/. I am new to Beautiful Soup, but if there is a more straightforward way to scrape the list and save the data, I am interested in any solution.
The code below does not raise an error; however, it does not return any tickers either.
import bs4 as bs
import pickle  # serializes any python object so that we do not have to go back to the CNBC website
               # each time we want to use the 100 ticker symbols
import requests

def save_nasdaq_tickers():
    '''We start by getting the source code for CNBC. We will use the requests module for this.'''
    resp = requests.get('https://www.cnbc.com/nasdaq-100')
    soup = bs.BeautifulSoup(resp.text, "lxml")  # we use .text because resp.text is the text of the source code
    table = soup.find('table', {'class': "data quoteTable"})  # the table whose class matches the table data we want from cnbc
    tickers = []  # empty tickers list
    # Next we iterate through the table.
    for row in table.findAll('tr')[1:]:  # all table rows except the header row (row 0), so 1 onward
        ticker = row.findAll('td')[0].txt  # td is a column of the table; 0 is the first column, which I believe holds the tickers
        # We specify .txt because it is a soup object
        tickers.append(ticker)
    # Save this list of tickers using pickle and with open
    with open("Nasdaq100Tickers", "wb") as f:  # name the file Nasdaq100Tickers
        pickle.dump(tickers, f)  # dumping the tickers to file f
    print(tickers)
    return tickers

save_nasdaq_tickers()
There is just a small mistake in your code, if you're wondering why you got nothing in your tickers: change ticker = row.findAll('td')[0].txt to ticker = row.findAll('td')[0].text. But when you want to get the full content of a dynamic page, you need Selenium.
import bs4 as bs
from selenium import webdriver

def save_nasdaq_tickers():
    try:
        dr = webdriver.Chrome()
        dr.get("https://www.cnbc.com/nasdaq-100")
        text = dr.page_source
    except Exception as e:
        raise e
    finally:
        dr.close()
    soup = bs.BeautifulSoup(text, "lxml")
    table = soup.find('table', {'class': "data quoteTable"})
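The snippet above stops after locating the table; a continuation using the corrected .text accessor could look roughly like this (a sketch following the question's original loop, not part of the original answer):
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text  # .text (not .txt) returns the cell's string content
        tickers.append(ticker)
    return tickers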
You can mimic the XHR request the page makes and parse out the JSON containing the data you are after:
import requests
import pandas as pd
import json
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup
url = 'https://quote.cnbc.com/quote-html-webservice/quote.htm?partnerId=2&requestMethod=quick&exthrs=1&noform=1&fund=1&output=jsonp&symbols=AAL|AAPL|ADBE|ADI|ADP|ADSK|ALGN|ALXN|AMAT|AMGN|AMZN|ATVI|ASML|AVGO|BIDU|BIIB|BMRN|CDNS|CELG|CERN|CHKP|CHTR|CTRP|CTAS|CSCO|CTXS|CMCSA|COST|CSX|CTSH&callback=quoteHandler1'
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")
s = soup.select('html')[0].text.strip('quoteHandler1(').strip(')')
data= json.loads(s)
data = json_normalize(data)
df = pd.DataFrame(data)
print(df[['symbol','last']])
This returns JSON with one object per symbol, from which the symbol and last price are printed above.
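One detail worth noting about the parsing above: str.strip removes any of the listed characters from either end rather than the literal callback prefix, which happens to work for this payload but is fragile. A more explicit extraction of the JSONP body (a sketch, not part of the original answer) could use a regex:
import re
# Pull out the payload between the quoteHandler1( ... ) callback wrapper explicitly
m = re.search(r'quoteHandler1\((.*)\)', res.text, re.S)
data = json.loads(m.group(1))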
I'd like to scrape Airbnb's listings by city (for the 5 cities listed in the code) and gather information such as the price, a link to the listing, the room type, the number of guests, etc.
I was able to get the link, but I'm having trouble getting the price.
from bs4 import BeautifulSoup
import requests
import csv
from urllib.parse import urljoin  # For joining next page url with base url
from datetime import datetime  # For inserting the current date and time

start_url_nyc = "https://www.airbnb.com/s/New-York--NY--United-States"
start_url_mia = "https://www.airbnb.com/s/Miami--FL--United-States"
start_url_la = "https://www.airbnb.com/s/Los_Angeles--CA--United-States"
start_url_sf = "https://www.airbnb.com/s/San_Francisco--CA--United-States"
start_url_orl = "https://www.airbnb.com/s/Orlando--FL--United-States"

def scrape_airbnb(url):
    # Set up the URL Request
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # Iterate over search results
    for search_result in soup.find_all('div', 'infoContainer_tfq3vd'):
        # Parse the name and price and record the time
        link_end = search_result.find('a').get('href')
        link = "https://www.airbnb.com" + link_end
        price = search_result.find('span', 'data-pricerate').find('data-reactid').get(int)
    return (price)

print(scrape_airbnb(start_url_orl))
This is the html code:
<span data-pricerate="true" data-reactid=".91165im9kw.0.2.0.3.2.1.0.$0.$grid_0.$0/=1$=01$16085565.$=1$16085565.0.2.0.1.0.0.0.1:1">552</span>
This is your code
price = search_result.find('span', 'data-pricerate').find('data-reactid').get(int)
first:
Some attributes, like the data-* attributes in HTML 5, have names that can’t be used as the names of keyword arguments:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
data_soup.find_all(data-foo="value")
# SyntaxError: keyword can't be an expression
You can use these attributes in searches by putting them into a dictionary and passing the dictionary into find_all() as the attrs argument:
data_soup.find_all(attrs={"data-foo": "value"})
# [<div data-foo="value">foo!</div>]
then:
price = search_result.find('span', attrs={"data-pricerate":"true"})
This will return a span tag whose content is the price as a string; just use .text:
price = search_result.find('span', attrs={"data-pricerate":"true"}).text
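If you then need the price as a number rather than a string (an assumption about the next step, not from the original answer), the text can be cleaned and cast; a minimal sketch:
price_text = search_result.find('span', attrs={"data-pricerate": "true"}).text
price_value = int(price_text.strip().lstrip('$').replace(',', ''))  # hypothetical cleanup; adjust to the actual text format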
I am trying to scrape results from the BBC Sport website. I've got the scores working, but when I try to add team names the program prints out None 1-0 None (for example). This is the code:
from bs4 import BeautifulSoup
import urllib.request
import csv

url = 'http://www.bbc.co.uk/sport/football/teams/derby-county/results'
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page)

for match in soup.select('table.table-stats tr.report'):
    team1 = match.find('span', class_='team-home')
    team2 = match.find('span', class_='team-away')
    score = match.abbr
    print(team1.string, score.string, team2.string)
It looks like you are searching for tags that are not there. For instance class_="team-home teams" is in the html, but class_='team-home' is not. The following code prints the first team name:
tables = soup.find_all("table", class_="table-stats")
tables[0].find("span", class_="team-home teams").text
# u' Birmingham '
Here is a possible solution which gets the home and away team names, the final score, the match date and the competition name via BeautifulSoup and puts it in a DataFrame.
import requests
import pandas as pd
from bs4 import BeautifulSoup
#Get the relevant webpage set the data up for parsing
url = "http://www.bbc.co.uk/sport/football/teams/derby-county/results"
r = requests.get(url)
soup=BeautifulSoup(r.content,"lxml")
#set up a function to parse the "soup" for each category of information and put it in a DataFrame
def get_match_info(soup, tag, class_name, column_name):
    info_array = []
    for info in soup.find_all(tag, attrs={'class': class_name}):
        info_array.append({column_name: info.text})
    return pd.DataFrame(info_array)
#for each category pass the above function the relevant information i.e. tag names
date = get_match_info(soup,"td","match-date","Date")
home_team = get_match_info(soup,"span","team-home teams","Home Team")
score = get_match_info(soup,"span","score","Score")
away_team = get_match_info(soup,"span","team-away teams","Away Team")
competition = get_match_info(soup,"td","match-competition","Competition")
#Concatenate the DataFrames to present a final table of all the above info
match_info = pd.concat([date,home_team,score,away_team,competition],ignore_index=False,axis=1)
print(match_info)