Adding Data from Beautiful Soup table to a list

Adding Data from Beautiful Soup table to a list - python

Hello I'm a beginner to python and programming in general, and I was wondering how I would make the outputted data a list. I used bs to extract data from a table and attempt to make a list with the data, but I end up only adding the first number to the list. Can someone provide me assistance and an explaination?
from bs4 import BeautifulSoup
from requests_html import HTMLSession
s = HTMLSession()
url = 'https://www.timeanddate.com/weather/usa/new-york/ext'
def get_data(url):
r = s.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
return soup
with open('document.txt', 'a') as f:
f.write(str(get_data(url)))
with open('document.txt', 'r') as html_file:
contents = html_file.read()
soup = BeautifulSoup(contents, 'lxml')
forecast_table = soup.find('table', class_ = 'zebra tb-wt fw va-m tb-hover')
wtitle = soup.title.text
print(wtitle)
print("------")
def get_weather_high(forecast_table):
print("Weather Highs:")
for high in forecast_table.find_all('tbody'):
rows1 = high.find_all('tr')
for row1 in rows1:
pl_high = row1.find_all('td')
pl_high = [td.text.strip() for td in pl_high]
pl_high = pl_high[1:2]
for pl_high_final in pl_high:
pl_high_final = pl_high_final[0:3]
print(pl_high_final)
get_weather_high(forecast_table)
This the output. Instead of each line being a number, I want to have it all under on list

Create a list before your for loop and just append your data instead of printing it and then just print the list after the for loop
data = []
def get_weather_high(forecast_table):
print("Weather Highs:")
for high in forecast_table.find_all('tbody'):
rows1 = high.find_all('tr')
for row1 in rows1:
pl_high = row1.find_all('td')
pl_high = [td.text.strip() for td in pl_high]
pl_high = pl_high[1:2]
for pl_high_final in pl_high:
pl_high_final = pl_high_final[0:3]
data.append(pl_high_final)
print(data) # or return data if you need it some where else

Related

Handling HTML in python

I had a problem when I took out html files and imported them into excel.
This is the site i need to get information: https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html
As you can see, in the GDP table I have a row named : 年份 separated from 2 lines
That's why after i exported the excel file it gave unexpected results
The result I want is that the first line in excel will only have : 年份 , GDP(美元), 占世界%
Sorry for my confusing explanation, I really don't know how to explain it in detail.
Here is my python code
import requests
from bs4 import BeautifulSoup
import lxml
import csv
def get_html(url):
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
r = "fail"
return r
def getGDP(ulist,html):
soup = BeautifulSoup(html,"html.parser")
trs = soup.find_all('tr')
for tr in trs:
list = []
for th in tr:
ts = th.string
if ts == '\n':
continue
list.append(ts)
ulist.append(list)
def saveGDP(ulist):
file_name = '21095010 胡碧玉 GDP.csv'
with open(file_name,'w',errors='ignore',newline='') as f:
f_csv = csv.writer(f)
f_csv.writerows(ulist)
def main():
unifo=[]
url='https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html'
html=get_html(url)
getGDP(unifo,html)
saveGDP(unifo)
if __name__=="__main__":
main()
Thank you so much!

Using pandas scraping tables and cleaning of results in most cases is mutch easier - under the hood beautifulsoup is working for you.
In this case read_html() the table, drop the unwanted header level and filter out the rows containings ads:
import pandas as pd
df = pd.read_html('https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html')[0].droplevel(0, axis=1)
df[~df.iloc[:,0].str.contains('ads')].to_csv('21095010 胡碧玉 GDP.csv', index=False)
Answering your question
You have to select your elements more specific e.g. with css selectors.
So first get the thead information from all th witout colspan, than collect the data from all tr in tbody that do not contains ads:
def getGDP(html):
soup = BeautifulSoup(html,"html.parser")
data = []
data.append([th.text for th in soup.select('thead th:not([colspan])')])
for row in soup.select('tbody tr:not(:-soup-contains("ads"))'):
data.append(list(row.stripped_strings))
return data
Example
import requests
from bs4 import BeautifulSoup
import lxml
import csv
def get_html(url):
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
r = "fail"
return r
def getGDP(html):
soup = BeautifulSoup(html,"html.parser")
data = []
data.append([th.text for th in soup.select('thead th:not([colspan])')])
for x in soup.select('tbody tr:not(:-soup-contains("ads"))'):
data.append(list(x.stripped_strings))
return data
def saveGDP(ulist):
file_name = '21095010 胡碧玉 GDP.csv'
print(ulist)
with open(file_name,'w',errors='ignore', encoding='utf-8') as f:
f_csv = csv.writer(f)
f_csv.writerows(ulist)
def main():
url='https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html'
html=get_html(url)
saveGDP(getGDP(html))
if __name__=="__main__":
main()

I'm trying to deduplicate weblinks scraped using Python & BeautifulSoup but it's not working

I'm trying to scrape a website in Python, I got the links to print but in trying to make them a set to deduplicate, there are still duplicates. Anyone have any advice on what I am doing wrong? Thanks in advance!
Edit: So I tried what John suggested but my csv output is a cascading list of links across the excel sheet, it's crazy...I'll post the changes below this original code:
import requests
from bs4 import BeautifulSoup
page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink) )
#output = The number of links to start with are: 254
import csv
with open('census_links.csv', 'w', newline='') as f:
weblinks = str(mylink)
writer = csv.writer(f, delimiter = ' ', lineterminator = '\r')
for link in mylink:
hrefs = str(link.get('href'))
if hrefs.startswith("None"):
''
elif hrefs.startswith('http'):
MySet = set()
MySet.add(hrefs)
elif hrefs.startswith('#'):
''
elif hrefs.startswith(' '):
''
print(set(MySet))
file.write(str(MySet)+'\n')
file.close
#Edited code:
import requests
from bs4 import BeautifulSoup
page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink))
# The number of links to start with are: 254
import csv
with open('census_links.csv', 'w', newline='') as f:
weblinks = str(mylink)
writer = csv.writer(f, delimiter = ',', lineterminator = '\r')
MySet = set()
for link in mylink:
hrefs = str(link.get('href'))
if hrefs.startswith("None"):
continue
elif hrefs.startswith('#'):
continue
elif hrefs.startswith(' '):
continue
elif hrefs.startswith('http'):
MySet.add(hrefs)
file.write(str(MySet)+'\n')
file.close
print(str(MySet) +'\n')

to get unique links, you want to check if the link is in MySet with hrefs not in MySet.
for simple operation you don't need csv, to write in single row
"\n".join(MySet)
and to write single column
",".join(MySet)
MySet = set()
for link in mylink:
hrefs = link.get('href')
if not hrefs or hrefs.startswith('#'):
continue
# normalize link
if hrefs.startswith('/'):
hrefs = 'https://www.census.gov' + hrefs
# check if link already in MySet
if hrefs not in MySet:
MySet.add(hrefs)
with open('census_links.csv', 'w', newline='') as f:
f.write("\n".join(MySet))
print("\n".join(MySet))

Initialize the set before the loop, and wait to print it until after the loop is done.
MySet = set()
...
for link in mylink:
hrefs = str(link.get('href'))
...
if hrefs.startswith('http'):
MySet.add(hrefs)
...
print(MySet)

same code part to get content.
import requests
from bs4 import BeautifulSoup
page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink) )
#output = The number of links to start with are: 254
use pandas to get the unique url which starts with http.
import pandas as pd
obj = pd.Series(mylink)
obj_link = obj.map(lambda x: x.get('href')).drop_duplicates().dropna()
cond = obj_link.str.startswith('http')
dfn = obj_link.loc[cond].to_frame()
dfn.shape # (93, 1)
dfn.to_csv('census_links.csv', index=False, header=False)

Beautiful Soup - Results to CSV for all items in lists

The below snippet "works" but is only outputting the first record to the CSV. I'm trying to get it to output the same output, but for each gun in the list of gun urls in the all_links list.
Any modification i've made to it with prints for the output (just to see it working) prints the same result
or if i make a gun_details list and try to print it, get the same one item output.
How would i go about printing all the gun_details labels and spans into a CSV?
import csv
import urllib.request
import requests
from bs4 import BeautifulSoup
all_links = []
url = "https://www.guntrader.uk/dealers/minsterley/minsterley-ranges/guns?page={}"
for page in range(1, 3):
res = requests.get(url).text
soup = BeautifulSoup(res, "html.parser")
for link in soup.select(
'a[href*="dealers/minsterley/minsterley-ranges/guns/shotguns/"]'
):
all_links.append("https://www.guntrader.uk" + link["href"])
for a_link in all_links:
gun_label = []
gun_span = []
res = urllib.request.urlopen(a_link)
# res = requests.get(a_link)
soup = BeautifulSoup(res, "html.parser")
for gun_details in soup.select("div.gunDetails"):
for l in gun_details.select("label"):
gun_label.append(l.text.replace(":", ""))
for s in gun_details.select("span"):
gun_span.append(s.text)
my_dict = dict(zip(gun_label, gun_span))
with open("mycsvfile.csv", "w") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=None)
for key in my_dict.keys():
csvfile.write(f"{key},{my_dict[key]}\n")

Try running the middle section this way:
for a_link in all_links:
gun_label = []
gun_span = []
res = requests.get(a_link)
soup = bs(res.content, 'html.parser') #note it's 'res.content', not just 'res'
for gun_details in soup.select('div.gunDetails'):
for l in gun_details.select('label'):
gun_label.append(l.text.replace(':',''))
for s in gun_details.select('span'):
gun_span.append(s.text)
#this block is now indented differently - it's INSIDE the 'for' loop
my_dict = dict(zip(gun_label, gun_span))
with open('mycsvfile.csv', 'a') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=None)
for key in my_dict.keys():
csvfile.write(f"{key},{my_dict[key]}\n")

How to loop through BS4 data and print div tag correctly

I am trying to copy all the data within an HTML page that has the certain class "chapter_header_styling" with BS4.
This was working when I manually inputed the URL – but is tedious when there are multiple books and various chapters. So I then created another script that would generate all the chapter URLs for the book and combine them into a text file bchap.txt (book chapters).
Since then I have altered the file and added various break points so ignore my lack of comments and unused arrays/list. I have narrowed it down to the ###Comment## where it doesn't work. It's probably not nested right but I'm not sure... I had this working to a point but can't figure out why it won't paste the mydivs data into the book.html file. If anyone with more experience could point me in the right direction much would be appreciated.
#mkbook.py
# coding: utf-8
from bs4 import BeautifulSoup
import requests
LINK = "https://codes.iccsafe.org/content/FAC2017"
pop = ""
#z = ""
chapters = open("bchap.txt",'r')
a = []
for aline in chapters:
chap = aline
#print (chap)
#pop = ""
pop = LINK+chap
#print (pop)
r = requests.get(pop)
data = r.text
#print(data)
soup = BeautifulSoup(data, 'html.parser')
mydivs = soup.findAll("div", {"class": ["annotator", "chapter_header_styling"]})
f = open("BOOK.html","a")
f.write("test <br/>")
########################################
#MY PROBLEM IS BELOW NOT PRINTING DIV DATA INTO TXT FILE
########################################
for div in mydivs:
print (div)
z = str(div)
print(z) #doesn't printout...why???
f.write(z)
print len(mydivs)
f.close()
chapters.close()
##############################################
## this is the old mkbook.py code before I looped it - inputing url 1 # time
#
# coding: utf-8
from bs4 import BeautifulSoup
import requests
r = requests.get("https://codes.iccsafe.org/content/FAC2017/preface")
data = r.text
soup = BeautifulSoup(data, 'html.parser')
a = []
mydivs = soup.findAll("div",{"class":["annotator",
"chapter_header_styling"]})
f = open("BOOK.html","a")
for div in mydivs:
z = str(div)
f.write(z)
f.close()
print len(mydivs) #outputs 1 if copied div data.
#######################################
#mkchap.py
# coding: utf-8
from bs4 import BeautifulSoup
import requests
r = requests.get("https://codes.iccsafe.org/content/FAC2017")
data = r.text
soup = BeautifulSoup(data, 'html.parser')
a = []
soup.findAll('option',{"value":True})
list = soup.findAll('option')
with open('bchap.txt', 'w') as filehandle:
for l in list:
filehandle.write(l['value'])
filehandle.write("\n")
print l['value']
#with open('bchap.txt', 'w') as filehandle:
# filehandle.write("%s\n" % list)
filehandle.close()

The problem seems to be that you are constructing your url using a wrong base url.
LINK = "https://codes.iccsafe.org/content/FAC2017"
If you take a look at your 1st request you can see this clearly.
print(pop)
print(r.status_code)
Outputs:
https://codes.iccsafe.org/content/FAC2017/content/FAC2017
404
After running the code to populate bchap.txt, its output is
/content/FAC2017
/content/FAC2017/legend
/content/FAC2017/copyright
/content/FAC2017/preface
/content/FAC2017/chapter-1-application-and-administration
/content/FAC2017/chapter-2-scoping-requirements
/content/FAC2017/chapter-3-building-blocks
/content/FAC2017/chapter-4-accessible-routes
/content/FAC2017/chapter-5-general-site-and-building-elements
/content/FAC2017/chapter-6-plumbing-elements-and-facilities
/content/FAC2017/chapter-7-communication-elements-and-features
/content/FAC2017/chapter-8-special-rooms-spaces-and-elements
/content/FAC2017/chapter-9-built-in-elements
/content/FAC2017/chapter-10-recreation-facilities
/content/FAC2017/list-of-figures
/content/FAC2017/fair-housing-accessibility-guidelines-design-guidelines-for-accessible-adaptable-dwellings
/content/FAC2017/advisory
Lets change the base url first and try again.
from bs4 import BeautifulSoup
import requests
LINK = "https://codes.iccsafe.org"
pop = ""
chapters = open("bchap.txt",'r')
a = []
for aline in chapters:
chap = aline
pop = LINK+chap
r = requests.get(pop)
print(pop)
print(r.status_code)
chapters.close()
Outputs:
https://codes.iccsafe.org/content/FAC2017
404
...
why? b'coz of the \n. If we do a
print(repr(pop))
It will output
'https://codes.iccsafe.org/content/FAC2017\n'
You'll have to strip away that \n also. The final code that worked is
from bs4 import BeautifulSoup
import requests
LINK = "https://codes.iccsafe.org"
pop = ""
chapters = open("bchap.txt",'r')
a = []
for aline in chapters:
chap = aline
pop = LINK+chap
r = requests.get(pop.strip())
data = r.text
soup = BeautifulSoup(data, 'html.parser')
mydivs = soup.findAll("div", class_="annotator chapter_header_styling")
f = open("BOOK.html","a")
for div in mydivs:
z = str(div)
f.write(z)
f.close()
chapters.close()

Loop in Python script, Only get last results

Why do I only get the stats from the last player in PLAYER_NAME?
I would like to get the stats from all the players in PLAYER_NAME.
import csv
import requests
from bs4 import BeautifulSoup
import urllib
PLAYER_NAME = ["andy-murray/mc10", "rafael-nadal/n409"]
URL_PATTERN = 'http://www.atpworldtour.com/en/players/{}/player-stats?year=0&surfaceType=clay'
for item in zip (PLAYER_NAME):
url = URL_PATTERN.format(item)
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
table = soup.find('div', attrs={'class': 'mega-table-wrapper'})
list_of_rows = []
for row in table.findAll('tr'):
list_of_cells = []
for cell in row.findAll('td'):
text = (cell.text.encode("utf-8").strip())
list_of_cells.append(text)
list_of_rows.append(list_of_cells)
outfile = open("./tennis.csv", "wb")
writer = csv.writer(outfile)
writer.writerow(["Name", "Stat"])
writer.writerows(list_of_rows)

As mentioned in the comments, you're recreating list_of_rows every time. To fix that, you have to move it outside the for loop, and instead of appending to it, and turning it into a list of lists, extend it.
On a side note, you have a few other issues with your code:
zip is redundant, and it actually ends up converting your names into tuples, which will cause incorrect formatting, you just want to iterate over PLAYER_NAME, and while you're at it, maybe rename that to PLAYER_NAMES (since it's a list of names)
When trying to format the string you just have empty braces, you need a number in there to specify the position of the argument in format - in this case {0}.
PLAYER_NAMES = ["andy-murray/mc10", "rafael-nadal/n409"]
URL_PATTERN = 'http://www.atpworldtour.com/en/players/{0}/player-stats?year=0&surfaceType=clay'
list_of_rows = []
for item in PLAYER_NAMES:
url = URL_PATTERN.format(item)
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
table = soup.find('div', attrs={'class': 'mega-table-wrapper'})
# for row in table.findAll('tr'):
# list_of_cells = []
# for cell in row.findAll('td'):
# text = (cell.text.encode("utf-8").strip())
# list_of_cells.append(text)
# list_of_rows.extend(list_of_cells) # Change to extend here
# Incidentally, the for loop above could also be written as:
list_of_rows += [
[cell.text.encode("utf-8").strip() for cell in row.findAll('td')]
for row in table.findAll('tr')
]

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Adding Data from Beautiful Soup table to a list - python

Related

Handling HTML in python

I'm trying to deduplicate weblinks scraped using Python & BeautifulSoup but it's not working

Beautiful Soup - Results to CSV for all items in lists

How to loop through BS4 data and print div tag correctly

Loop in Python script, Only get last results

Categories

Resources