Handling HTML in python

Handling HTML in python - python

I had a problem when I took out html files and imported them into excel.
This is the site i need to get information: https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html
As you can see, in the GDP table the header 年份 is split across two rows.
That is why the exported Excel file gave unexpected results.
The result I want is for the first row in Excel to contain only: 年份, GDP(美元), 占世界%
Sorry for my confusing explanation, I really don't know how to explain it in detail.
Here is my python code
import requests
from bs4 import BeautifulSoup
import lxml
import csv
def get_html(url):
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
r = "fail"
return r
def getGDP(ulist,html):
soup = BeautifulSoup(html,"html.parser")
trs = soup.find_all('tr')
for tr in trs:
list = []
for th in tr:
ts = th.string
if ts == '\n':
continue
list.append(ts)
ulist.append(list)
def saveGDP(ulist):
file_name = '21095010 胡碧玉 GDP.csv'
with open(file_name,'w',errors='ignore',newline='') as f:
f_csv = csv.writer(f)
f_csv.writerows(ulist)
def main():
unifo=[]
url='https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html'
html=get_html(url)
getGDP(unifo,html)
saveGDP(unifo)
if __name__=="__main__":
main()
Thank you so much!

Scraping tables with pandas and cleaning up the results is, in most cases, much easier - under the hood BeautifulSoup is doing the work for you.
In this case, use read_html() to read the table, drop the unwanted header level and filter out the rows containing ads:
import pandas as pd
# read_html() returns a list of DataFrames; [0] takes the first one.
# droplevel(0, axis=1) removes the top level of the column header so only
# the lower header row remains.
df = pd.read_html('https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html')[0].droplevel(0, axis=1)
# Keep only the rows whose first column does not contain the text 'ads',
# then write them to CSV without the DataFrame index column.
df[~df.iloc[:,0].str.contains('ads')].to_csv('21095010 胡碧玉 GDP.csv', index=False)
Answering your question
You have to select your elements more specifically, e.g. with CSS selectors.
So first get the header names from all th elements in thead that have no colspan attribute, then collect the data from all tr elements in tbody that do not contain ads:
def getGDP(html):
    """Extract the GDP table from *html* as a list of rows.

    The first row holds the column names; every following row holds the
    stripped cell texts of one table body row.

    Args:
        html: The page markup as a string.

    Returns:
        A list of lists of strings, header row first.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Header cells that carry a colspan attribute are skipped so that only
    # the plain column names end up in the first row.
    header = [
        th.get_text(strip=True)
        for th in soup.select('thead th:not([colspan])')
    ]
    data = [header]
    # Body rows whose text contains "ads" are excluded by the selector.
    for row in soup.select('tbody tr:not(:-soup-contains("ads"))'):
        data.append(list(row.stripped_strings))
    return data
Example
import requests
from bs4 import BeautifulSoup
import lxml
import csv
def get_html(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the literal string "fail" when the
        request cannot be completed (connection error, timeout, or an
        HTTP error status reported by raise_for_status()).
    """
    try:
        # Without a timeout the call can block forever on a stalled server.
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are turned into the "fail" marker;
        # programming errors are no longer hidden by a bare except.
        return "fail"
    # Use the encoding detected from the content instead of the one guessed
    # from the headers, so non-ASCII text is decoded properly.
    r.encoding = r.apparent_encoding
    return r.text
def getGDP(html):
    """Extract the GDP table from *html* as a list of rows.

    The first row holds the column names; every following row holds the
    stripped cell texts of one table body row.

    Args:
        html: The page markup as a string.

    Returns:
        A list of lists of strings, header row first.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Header cells that carry a colspan attribute are skipped so that only
    # the plain column names end up in the first row.
    header = [
        th.get_text(strip=True)
        for th in soup.select('thead th:not([colspan])')
    ]
    data = [header]
    # Body rows whose text contains "ads" are excluded by the selector.
    for row in soup.select('tbody tr:not(:-soup-contains("ads"))'):
        data.append(list(row.stripped_strings))
    return data
def saveGDP(ulist):
    """Write the rows in *ulist* to the GDP CSV file, encoded as UTF-8.

    Args:
        ulist: An iterable of rows, each row an iterable of cell values.
    """
    file_name = '21095010 胡碧玉 GDP.csv'
    print(ulist)
    # newline='' hands line-ending control to the csv module; without it
    # every row is followed by an empty line on Windows.
    with open(file_name, 'w', errors='ignore', encoding='utf-8',
              newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(ulist)
def main():
    """Fetch the GDP statistics page, extract its table and save it as CSV."""
    url = 'https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html'
    html = get_html(url)
    saveGDP(getGDP(html))


if __name__ == "__main__":
    main()

Related

CSV file being exported empty and only the headers are showing?

So I'm learning more about Python every day. I'm doing a mini web-scraping project, and at the very end, when I should see the results in an exported CSV, it comes up blank except for the headers. Any help is gladly appreciated! Thanks.
The code is below:
import csv
import requests
from bs4 import BeautifulSoup
url = "https://www.boxofficemojo.com/year/"
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, "html.parser")
box_office_table = soup.find("div", class_="a-section mojo-body aok-relative").find_all("tr")
with open('imdbmovies.csv', 'a', newline='') as csvfile:
writer = csv.writer(csvfile)
# Write headers to CSV file
writer.writerow(['numone_release', 'year', 'total_gross', 'releases', 'average', 'gross_change'])
for row in box_office_table:
try:
year_cell = row.find("td", class_="a-text-left mojo-header-column mojo-field-type-year mojo-sort-column")
money_cells = row.find_all("td", class_="a-text-right mojo-field-type-money")
releases_cell = row.find("td", class_="a-text-right mojo-field-type-positive_integer")
gross_change_cell = row.find("td", class_="a-text-right mojo-number-delta mojo-field-type-percent_delta")
numone_release_cell = row.find("td", class_="a-text-left mojo-field-type-release mojo-cell-wide")
if len(money_cells) >= 2 and year_cell is not None and releases_cell is not None and gross_change_cell is not None and numone_release_cell is not None:
total_gross_cell = money_cells[0]
average_cell = money_cells[1]
year = year_cell.text.strip()
total_gross = total_gross_cell.text.strip()
releases = releases_cell.text.strip()
average = average_cell.text.strip()
gross_change = gross_change_cell.text.strip()
numone_release = numone_release_cell.text.strip()
print(year, total_gross, releases, average, gross_change, numone_release)
# Write the row to the CSV file
writer.writerow([numone_release, year, total_gross, releases, average, gross_change])
except AttributeError:
# Either a cell is not found
pass

Adding Data from Beautiful Soup table to a list

Hello, I'm a beginner to Python and programming in general, and I was wondering how I would turn the outputted data into a list. I used bs to extract data from a table and attempted to make a list with the data, but I end up only adding the first number to the list. Can someone provide me with assistance and an explanation?
from bs4 import BeautifulSoup
from requests_html import HTMLSession
s = HTMLSession()
url = 'https://www.timeanddate.com/weather/usa/new-york/ext'
def get_data(url):
r = s.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
return soup
with open('document.txt', 'a') as f:
f.write(str(get_data(url)))
with open('document.txt', 'r') as html_file:
contents = html_file.read()
soup = BeautifulSoup(contents, 'lxml')
forecast_table = soup.find('table', class_ = 'zebra tb-wt fw va-m tb-hover')
wtitle = soup.title.text
print(wtitle)
print("------")
def get_weather_high(forecast_table):
print("Weather Highs:")
for high in forecast_table.find_all('tbody'):
rows1 = high.find_all('tr')
for row1 in rows1:
pl_high = row1.find_all('td')
pl_high = [td.text.strip() for td in pl_high]
pl_high = pl_high[1:2]
for pl_high_final in pl_high:
pl_high_final = pl_high_final[0:3]
print(pl_high_final)
get_weather_high(forecast_table)
This is the output. Instead of each number being printed on its own line, I want to have them all in one list.

Create a list before your for loop and append your data to it instead of printing it; then print the list once after the for loop.
data = []


def get_weather_high(forecast_table):
    """Collect the second-column reading of every forecast row.

    For each row of each tbody in *forecast_table*, the stripped text of
    the second td cell is cut to its first three characters and appended
    to the module-level ``data`` list.

    Args:
        forecast_table: A parsed table element offering ``find_all``.

    Returns:
        The ``data`` list, after it has been printed once.
    """
    print("Weather Highs:")
    for tbody in forecast_table.find_all('tbody'):
        for row in tbody.find_all('tr'):
            cells = [td.text.strip() for td in row.find_all('td')]
            # The [1:2] slice yields the second cell, or nothing at all
            # when the row has fewer than two cells, so short rows are
            # skipped instead of raising IndexError.
            for cell in cells[1:2]:
                data.append(cell[0:3])
    # Print once, after all rows were processed, rather than per value.
    print(data)
    return data

I'm trying to deduplicate weblinks scraped using Python & BeautifulSoup but it's not working

I'm trying to scrape a website in Python, I got the links to print but in trying to make them a set to deduplicate, there are still duplicates. Anyone have any advice on what I am doing wrong? Thanks in advance!
Edit: So I tried what John suggested but my csv output is a cascading list of links across the excel sheet, it's crazy...I'll post the changes below this original code:
import requests
from bs4 import BeautifulSoup
page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink) )
#output = The number of links to start with are: 254
import csv
with open('census_links.csv', 'w', newline='') as f:
weblinks = str(mylink)
writer = csv.writer(f, delimiter = ' ', lineterminator = '\r')
for link in mylink:
hrefs = str(link.get('href'))
if hrefs.startswith("None"):
''
elif hrefs.startswith('http'):
MySet = set()
MySet.add(hrefs)
elif hrefs.startswith('#'):
''
elif hrefs.startswith(' '):
''
print(set(MySet))
file.write(str(MySet)+'\n')
file.close
#Edited code:
import requests
from bs4 import BeautifulSoup
page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink))
# The number of links to start with are: 254
import csv
with open('census_links.csv', 'w', newline='') as f:
weblinks = str(mylink)
writer = csv.writer(f, delimiter = ',', lineterminator = '\r')
MySet = set()
for link in mylink:
hrefs = str(link.get('href'))
if hrefs.startswith("None"):
continue
elif hrefs.startswith('#'):
continue
elif hrefs.startswith(' '):
continue
elif hrefs.startswith('http'):
MySet.add(hrefs)
file.write(str(MySet)+'\n')
file.close
print(str(MySet) +'\n')

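To get unique links, collect them in a set that is created once before the loop; a set ignores duplicates, so the explicit hrefs not in MySet check is optional.
For a simple operation like this you don't need csv. To write the links in a single column (one per line):
"\n".join(MySet)
and to write them in a single row:
",".join(MySet)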
from urllib.parse import urljoin

# Page the links were scraped from; relative links are resolved against it.
PAGE_URL = 'https://www.census.gov/programs-surveys/popest.html'

MySet = set()
for link in mylink:
    hrefs = link.get('href')
    # Skip anchors without an href and in-page fragment links.
    if not hrefs or hrefs.startswith('#'):
        continue
    # normalize link: urljoin turns site-relative ('/x'), page-relative
    # ('x') and protocol-relative ('//host/x') links into absolute URLs and
    # leaves already absolute ones unchanged.
    # set.add() is a no-op for links already present, so no explicit
    # membership test is needed.
    MySet.add(urljoin(PAGE_URL, hrefs))

# Sort so the file content is the same on every run; plain set iteration
# order for strings is not stable between interpreter runs.
output = "\n".join(sorted(MySet))
with open('census_links.csv', 'w', newline='') as f:
    f.write(output)
print(output)

Initialize the set before the loop, and wait to print it until after the loop is done.
MySet = set()
...
for link in mylink:
hrefs = str(link.get('href'))
...
if hrefs.startswith('http'):
MySet.add(hrefs)
...
print(MySet)

same code part to get content.
import requests
from bs4 import BeautifulSoup
page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink) )
#output = The number of links to start with are: 254
Use pandas to get the unique URLs that start with http.
import pandas as pd
obj = pd.Series(mylink)
obj_link = obj.map(lambda x: x.get('href')).drop_duplicates().dropna()
cond = obj_link.str.startswith('http')
dfn = obj_link.loc[cond].to_frame()
dfn.shape # (93, 1)
dfn.to_csv('census_links.csv', index=False, header=False)

How to loop through BS4 data and print div tag correctly

I am trying to copy all the data within an HTML page that has the certain class "chapter_header_styling" with BS4.
This was working when I manually entered the URL – but that is tedious when there are multiple books and various chapters. So I then created another script that generates all the chapter URLs for the book and combines them into a text file, bchap.txt (book chapters).
Since then I have altered the file and added various break points so ignore my lack of comments and unused arrays/list. I have narrowed it down to the ###Comment## where it doesn't work. It's probably not nested right but I'm not sure... I had this working to a point but can't figure out why it won't paste the mydivs data into the book.html file. If anyone with more experience could point me in the right direction much would be appreciated.
#mkbook.py
# coding: utf-8
from bs4 import BeautifulSoup
import requests
LINK = "https://codes.iccsafe.org/content/FAC2017"
pop = ""
#z = ""
chapters = open("bchap.txt",'r')
a = []
for aline in chapters:
chap = aline
#print (chap)
#pop = ""
pop = LINK+chap
#print (pop)
r = requests.get(pop)
data = r.text
#print(data)
soup = BeautifulSoup(data, 'html.parser')
mydivs = soup.findAll("div", {"class": ["annotator", "chapter_header_styling"]})
f = open("BOOK.html","a")
f.write("test <br/>")
########################################
#MY PROBLEM IS BELOW NOT PRINTING DIV DATA INTO TXT FILE
########################################
for div in mydivs:
print (div)
z = str(div)
print(z) #doesn't printout...why???
f.write(z)
print len(mydivs)
f.close()
chapters.close()
##############################################
## this is the old mkbook.py code before I looped it - inputing url 1 # time
#
# coding: utf-8
from bs4 import BeautifulSoup
import requests
r = requests.get("https://codes.iccsafe.org/content/FAC2017/preface")
data = r.text
soup = BeautifulSoup(data, 'html.parser')
a = []
mydivs = soup.findAll("div",{"class":["annotator",
"chapter_header_styling"]})
f = open("BOOK.html","a")
for div in mydivs:
z = str(div)
f.write(z)
f.close()
print len(mydivs) #outputs 1 if copied div data.
#######################################
#mkchap.py
# coding: utf-8
from bs4 import BeautifulSoup
import requests
r = requests.get("https://codes.iccsafe.org/content/FAC2017")
data = r.text
soup = BeautifulSoup(data, 'html.parser')
a = []
soup.findAll('option',{"value":True})
list = soup.findAll('option')
with open('bchap.txt', 'w') as filehandle:
for l in list:
filehandle.write(l['value'])
filehandle.write("\n")
print l['value']
#with open('bchap.txt', 'w') as filehandle:
# filehandle.write("%s\n" % list)
filehandle.close()

The problem seems to be that you are constructing your url using a wrong base url.
LINK = "https://codes.iccsafe.org/content/FAC2017"
If you take a look at your 1st request you can see this clearly.
print(pop)
print(r.status_code)
Outputs:
https://codes.iccsafe.org/content/FAC2017/content/FAC2017
404
After running the code to populate bchap.txt, its output is
/content/FAC2017
/content/FAC2017/legend
/content/FAC2017/copyright
/content/FAC2017/preface
/content/FAC2017/chapter-1-application-and-administration
/content/FAC2017/chapter-2-scoping-requirements
/content/FAC2017/chapter-3-building-blocks
/content/FAC2017/chapter-4-accessible-routes
/content/FAC2017/chapter-5-general-site-and-building-elements
/content/FAC2017/chapter-6-plumbing-elements-and-facilities
/content/FAC2017/chapter-7-communication-elements-and-features
/content/FAC2017/chapter-8-special-rooms-spaces-and-elements
/content/FAC2017/chapter-9-built-in-elements
/content/FAC2017/chapter-10-recreation-facilities
/content/FAC2017/list-of-figures
/content/FAC2017/fair-housing-accessibility-guidelines-design-guidelines-for-accessible-adaptable-dwellings
/content/FAC2017/advisory
Lets change the base url first and try again.
from bs4 import BeautifulSoup
import requests
LINK = "https://codes.iccsafe.org"
pop = ""
chapters = open("bchap.txt",'r')
a = []
for aline in chapters:
chap = aline
pop = LINK+chap
r = requests.get(pop)
print(pop)
print(r.status_code)
chapters.close()
Outputs:
https://codes.iccsafe.org/content/FAC2017
404
...
Why? Because of the trailing \n. If we do a
print(repr(pop))
It will output
'https://codes.iccsafe.org/content/FAC2017\n'
You'll have to strip away that \n also. The final code that worked is
from bs4 import BeautifulSoup
import requests
LINK = "https://codes.iccsafe.org"

# Both files are opened once for the whole run and closed automatically,
# instead of reopening BOOK.html on every chapter and closing it only once.
with open("bchap.txt", 'r') as chapters, open("BOOK.html", "a") as f:
    for aline in chapters:
        # Each line read from the file ends with '\n', which would otherwise
        # become part of the URL and make the request fail.
        chap = aline.strip()
        if not chap:
            # Blank line in bchap.txt: nothing to download.
            continue
        pop = LINK + chap
        r = requests.get(pop)
        data = r.text
        soup = BeautifulSoup(data, 'html.parser')
        mydivs = soup.findAll("div", class_="annotator chapter_header_styling")
        # Append the markup of every matching div to the book file.
        for div in mydivs:
            f.write(str(div))

BeautifulSoup Scraping Formatting

This is my first time using BeautifulSoup and I am attempting to scrape store location data from a local convenience store.
However, I'm running into some issues trying to remove empty lines when the data is written to a CSV file; I've tried .replace('\n','') and .strip(), but neither worked.
I'm also having problems splitting data that is scraped from, and contained in, the same sibling node.
I've added the script below:
from bs4 import BeautifulSoup
from requests import get
import urllib.request
import sched, time
import csv
url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = get(url)
soup = BeautifulSoup(response.text, 'html.parser')
#print (soup.prettify())
#open a file for writing
location_data = open('data/soupdata.csv', 'w', newline='')
#create the csv writer object
csvwriter = csv.writer(location_data)
cheers = soup.find('div' , id="store_container")
count = 0
#Loop for Header tags
for paragraph in cheers.find_all('b'):
header1 = paragraph.text.replace(':' , '')
header2 = paragraph.find_next('b').text.replace(':' , '')
header3 = paragraph.find_next_siblings('b')[1].text.replace(':' , '')
if count == 0:
csvwriter.writerow([header1, header2, header3])
count += 1
break
for paragraph in cheers.find_all('br'):
brnext = paragraph.next_sibling.strip()
brnext1 = paragraph.next_sibling
test1 = brnext1.next_sibling.next_sibling
print(test1)
csvwriter.writerow([brnext, test1])
location_data.close()
Sample of output generated:
Sample of what output should look like:
How can I achieve this?
Thanks in advance.

To make it slightly organized, you can try like the following. I've used .select() instead of .find_all().
import csv
from bs4 import BeautifulSoup
import requests
url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
with open("output.csv","w",newline="") as infile:
writer = csv.writer(infile)
writer.writerow(["Address","Telephone","Store hours"])
for items in soup.select("#store_container .store_col"):
addr = items.select_one("b").next_sibling.next_sibling
tel = items.select_one("b:nth-of-type(2)").next_sibling
store = items.select_one("b:nth-of-type(3)").next_sibling
writer.writerow([addr,tel,store])

You just need to change the way of extracting address, tel and store hours
import csv
from bs4 import BeautifulSoup
from requests import get
url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# print (soup.prettify())
# open a file for writing
location_data = open('data/soupdata.csv', 'w', newline='')
# create the csv writer object
csvwriter = csv.writer(location_data)
cheers = soup.find('div', id="store_container")
count = 0
# Loop for Header tags
for paragraph in cheers.find_all('b'):
header1 = paragraph.text.replace(':', '')
header2 = paragraph.find_next('b').text.replace(':', '')
header3 = paragraph.find_next_siblings('b')[1].text.replace(':', '')
if count == 0:
csvwriter.writerow([header1, header2, header3])
count += 1
break
for paragraph in cheers.find_all('div'):
label = paragraph.find_all('b')
if len(label) == 3:
print(label)
address = label[0].next_sibling.next_sibling
tel = label[1].next_sibling
hours = label[2].next_sibling
csvwriter.writerow([address, tel, hours])
location_data.close()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Handling HTML in python - python

Related

CSV file being exported empty and only the headers are showing?

Adding Data from Beautiful Soup table to a list

I'm trying to deduplicate weblinks scraped using Python & BeautifulSoup but it's not working

How to loop through BS4 data and print div tag correctly

BeautifulSoup Scraping Formatting

Categories

Resources