Parsing html elements using BeautifulSoup - python

Suppose I have:
<tr>
<td class="prodSpecAtribute">word</td>
<td colspan="5">
another_word
</td>
</tr>
I want to extract the text of the two td cells (word and another_word):
So I used BeautifulSoup:
This is the code Martijn Pieters was asking for:
Basically, it grabs info from an HTML page (from a table) and stores the values in a left and a right column list. Then I create a dictionary from these details (using the left col list as the keys and the right col list as the values).
def get_data(page):
    """Build a ``{label: value}`` dict from the label/value rows of an HTML table.

    For every ``<tr>`` with at least two ``<td>`` cells, the first cell is
    passed to ``check()``; rows whose first cell is rejected are skipped.
    The first text node of an accepted cell becomes the key, and the texts of
    the remaining cells, joined with ``'. '``, become the value.

    Args:
        page: HTML markup handed to ``BeautifulSoup``.

    Returns:
        dict mapping left-column text to right-column text.  When the same
        left-column text occurs in several rows, the last row wins.
    """
    # Removes leftover HTML entities such as ``&nbsp;``.  Compiled once, and
    # written as a raw string so ``\w`` is not read as a string escape.
    entity = re.compile(r'&\w+;')
    soup = BeautifulSoup(page)
    item = {}
    # Iterate through each row
    for tr in soup.findAll('tr'):
        # Find all table data (cols) in that row
        tds = tr.findAll('td')
        # A usable row needs a label cell plus at least one value cell.
        if len(tds) < 2:
            continue
        label_td = tds[0]
        # check() is defined elsewhere; the explicit comparison with True is
        # kept so rows are accepted exactly as before.
        is_label = check(label_td) == True  # noqa: E712
        if not is_label:
            continue
        label_texts = label_td.findAll(text=True)
        if not label_texts:
            # An empty label cell has no text that could serve as a key.
            continue
        key = entity.sub('', str(label_texts[0]))
        values = []
        for td in tds[1:]:
            pieces = td.findAll(text=True)
            # Drop whitespace-only text nodes before joining.
            joined = ' '.join(text for text in pieces if text.strip())
            values.append(entity.sub('', joined))
        item[key] = '. '.join(values)
    return item

You may use HTQL (http://htql.net).
Here is for your example:
import htql
# Sample HTML: the table row from the question.
page="""
<tr>
<td class="prodSpecAtribute">word</td>
<td colspan="5">
another_word
</td>
</tr>
"""
# HTQL query string; see http://htql.net for the query syntax.
query = """
<tr>{
c1 = <td (class='prodSpecAtribute')>1 &tx;
c2 = <td>2 &tx &trim;
}
"""
# Run the query against the page and print the result converted to a dict.
a=htql.query(page, query)
print(dict(a))
It prints:
{'word': 'another_word'}

Related

How to get numbers from html?

I want to get
the number after: page=
the number after: "new">
the number after: /a>-
<td> </td>
<td> qwqwqwqwqw <br/> qwqwqwqwqw 4449-4450<br/> </td>
<td> </td>
<td> qwqwqwqwqw <br/> qwqwqwqwqw 5111-5550<br/> </td>
<td> </td>
...
My code
tables = soup.find_all('a', attrs={'target': 'new'})
gives me only a list (see below) without the third number
[4449,
5111,
...]
Here is how I would try to extract the 3 numbers from my list, once it has the third number in it.
# Accumulators for the three numbers wanted from each link.
list_of_number1 = []
list_of_number2 = []
list_of_number3 = []
# Pattern capturing the digits that follow "page=".
regex = re.compile("page=(\d+)")
for table in tables:
# NOTE(review): this filters the whole `tables` collection on every iteration,
# not the current `table`.
number1 = filter(regex.match, tables)
# Text that follows the current element, with surrounding whitespace removed.
number2 = table.next_sibling.strip()
# Left unfinished in the question: obtaining the third number is what is being asked.
number3 =
list_of_number1.append(number1)
list_of_number2.append(number2)
list_of_number3.append(number3)
Do I use BeautifulSoup for the third number, or is it feasible to regex through the whole HTML for any number following "/a>-"?
Here is how you can obtain your result using just the information that you need to get the numbers in the specific a element and in the text node immediately on the right:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, "html.parser")
tables = soup.find_all('a', attrs={'target': 'new'})

# One tuple per link: (link text, part of the href after the last '=',
# text that follows the link with every '-' removed).
triples = []
for link in tables:
    link_text = link.text
    page_value = link["href"].split('=')[-1]
    trailing = link.next_sibling.replace('-', '')
    triples.append((link_text, page_value, trailing))
print(triples)
# => [('4449', '99', '4450'), ('5111', '77', '5550')]
You may certainly go the harder way with regexps:
import re

#... the same initialization code as above
# Digits of the "page" parameter, whether it follows '?' or '#' in the href.
page_re = re.compile(r"[#?]page=(\d+)")

# One (link number, page number, trailing number) tuple per link.
results = []
for t in tables:
    page_m = page_re.search(t["href"])
    # None marks a link whose href carries no page number; int("") would raise.
    page = int(page_m.group(1)) if page_m else None
    # Keep only the digits of the text node that follows the link.
    num = "".join(ch for ch in t.next_sibling if ch.isdigit())
    results.append((int(t.text), page, int(num) if num else None))
print(results)
# => [(4449, 99, 4450), (5111, 77, 5550)]
NOTE:
t.text - gets the element text
t["href"] - gets the href attribute value of the t element
t.next_sibling - gets the next node after current one that is on the same hierarchy level.
You can also try:
# Select only anchors that carry an href, so the attribute lookup cannot fail.
for b in soup.select('a[href]'):
    # href part after the first '=', the link text, then the node that follows the link.
    print(b['href'].split('=')[1], b.text, b.next_sibling)
Output:
99 4449 -4450
77 5111 -5550

Converting a HTML table to a CSV in Python

I am trying to convert a table in HTML to a csv in Python. The table I am trying to extract is this one:
<table class="tblperiode">
<caption>Dades de període</caption>
<tr>
<th class="sortable"><span class="tooltip" title="Període (Temps Universal)">Període</span><br/>TU</th>
<th><span class="tooltip" title="Temperatura mitjana (°C)">TM</span><br/>°C</th>
<th><span class="tooltip" title="Temperatura màxima (°C)">TX</span><br/>°C</th>
<th><span class="tooltip" title="Temperatura mínima (°C)">TN</span><br/>°C</th>
<th><span class="tooltip" title="Humitat relativa mitjana (%)">HRM</span><br/>%</th>
<th><span class="tooltip" title="Precipitació (mm)">PPT</span><br/>mm</th>
<th><span class="tooltip" title="Velocitat mitjana del vent (km/h)">VVM (10 m)</span><br/>km/h</th>
<th><span class="tooltip" title="Direcció mitjana del vent (graus)">DVM (10 m)</span><br/>graus</th>
<th><span class="tooltip" title="Ratxa màxima del vent (km/h)">VVX (10 m)</span><br/>km/h</th>
<th><span class="tooltip" title="Irradiància solar global mitjana (W/m2)">RS</span><br/>W/m<sup>2</sup></th>
</tr>
<tr>
<th>
00:00 - 00:30
</th>
<td>16.2</td>
<td>16.5</td>
<td>15.4</td>
<td>93</td>
<td>0.0</td>
<td>6.5</td>
<td>293</td>
<td>10.4</td>
<td>0</td>
</tr>
<tr>
<th>
00:30 - 01:00
</th>
<td>16.4</td>
<td>16.5</td>
<td>16.1</td>
<td>90</td>
<td>0.0</td>
<td>5.8</td>
<td>288</td>
<td>8.6</td>
<td>0</td>
</tr>
And I want it to look something like this:
To achieve so, what I have tried is to parse the html and I have managed to build a dataframe with the data correctly doing the following:
from bs4 import BeautifulSoup
import csv
import pandas as pd  # `pd` is used below but was never imported

# Read the page and close the file handle as soon as it has been read.
with open("table.html") as html_file:
    html = html_file.read()
# Name the parser explicitly so the result does not depend on what is installed.
soup = BeautifulSoup(html, "html.parser")
table = soup.select_one("table.tblperiode")

# One list of cell texts per <tr>.  Only <td> cells are collected, so the
# header row and the <th> period labels are not part of the result.
output_rows = []
for table_row in table.findAll('tr'):
    output_rows.append([column.text for column in table_row.findAll('td')])

df = pd.DataFrame(output_rows)
print(df)
However, I would like to have the column names and a column indicating the time interval. In the example HTML above only two intervals appear, 00:00 - 00:30 and 00:30 - 01:00; therefore my table should have two rows, one with the observations for 00:00 - 00:30 and another with the observations for 00:30 - 01:00.
How could I get this information from my HTML?
Here's a way of doing it, it's probably not the nicest way but it works! You can read through the comments to figure out what the code is doing!
from bs4 import BeautifulSoup
import csv

# read the html, closing the file as soon as it has been read
with open("table.html") as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, 'html.parser')

# get the table from html
table = soup.select_one("table.tblperiode")

# find all rows; the first one holds the column headers
rows = table.findAll('tr')
headers = rows[0]
header_text = [th.text for th in headers.findAll('th')]

# every remaining row: text of each <th>/<td> cell, newlines removed and trimmed
row_text_array = []
for row in rows[1:]:
    cells = row.findAll(['th', 'td'])
    row_text_array.append([cell.text.replace('\n', '').strip() for cell in cells])

# output csv; newline='' leaves line endings to the csv module, which
# otherwise shows up as blank lines between rows on Windows
with open("out.csv", "w", newline='') as f:
    wr = csv.writer(f)
    wr.writerow(header_text)
    wr.writerows(row_text_array)
With this script:
import csv
from bs4 import BeautifulSoup

# Read the page and release the file handle straight away.
with open('table.html') as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, features='lxml')
table = soup.select_one('table.tblperiode')

rows = []
# Skip the first <tr>: it holds the column titles, not data.
for table_row in table.findAll('tr')[1:]:
    # The period label is the row's first <th>; collapse its inner whitespace.
    periode = [' '.join(table_row.findAll('th')[0].text.split())]
    data = [x.text for x in table_row.findAll('td')]
    rows.append(periode + data)

# NOTE(review): this header names 11 columns, while the sample rows shown for
# this script carry 10 values - confirm against the real table.
header = ['Periode', 'TM', 'TX', 'TN', 'HRM', 'PPT', 'VVM', 'DVM', 'VVX', 'PM', 'RS']
with open('result.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(header)
    w.writerows(rows)
I've managed to generate following CSV file on output:
Periode,TM,TX,TN,HRM,PPT,VVM,DVM,VVX,PM,RS
00:00 - 00:30,16.2,16.5,15.4,93,0.0,6.5,293,10.4,0
00:30 - 01:00,16.4,16.5,16.1,90,0.0,5.8,288,8.6,0
import csv
from bs4 import BeautifulSoup
import pandas as pd
# Read the page and close the file handle as soon as it has been read.
with open('test.html') as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, features='lxml')
#Specify table name which you want to read.
#Example: <table class="queryResults" border="0" cellspacing="1">
table = soup.select_one('table.queryResults')
def get_all_tables(soup):
    """Return every ``<table>`` element found by ``soup.find_all``."""
    return soup.find_all("table")
tbls = get_all_tables(soup)
# Print each table with a 1-based position.
for i, tablen in enumerate(tbls, start=1):
    print(i)
    print(tablen)
def get_table_headers(table):
    """Return the stripped text of every ``<th>`` in the table's first row.

    Args:
        table: element exposing ``find``; its first ``<tr>`` is used.

    Returns:
        list of header strings; empty when the table has no ``<tr>`` at all.
    """
    first_row = table.find("tr")
    if first_row is None:
        # A table without rows has no headers; avoid calling find_all on None.
        return []
    return [th.text.strip() for th in first_row.find_all("th")]
head = get_table_headers(table)
#print(head)
def get_table_rows(table):
    """Return the cell texts of every row after the first one.

    Args:
        table: element exposing ``find_all``; its first ``<tr>`` is skipped.

    Returns:
        list of rows, each a list of stripped cell strings.  A row uses its
        ``<td>`` cells when it has any and falls back to its ``<th>`` cells
        otherwise.
    """
    rows = []
    for tr in table.find_all("tr")[1:]:
        # Prefer td cells; rows without any td fall back to their th cells.
        cells = tr.find_all("td") or tr.find_all("th")
        rows.append([cell.text.strip() for cell in cells])
    return rows
table_rows = get_table_rows(table)
#print(table_rows)
def save_as_csv(table_name, headers, rows):
    """Write ``rows`` to ``<table_name>.csv`` with ``headers`` as column names.

    Args:
        table_name: output path without the ``.csv`` extension.
        headers: column names for the DataFrame.
        rows: list of row lists.
    """
    pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")
save_as_csv("Test_table", head, table_rows)

scraping a table from wikipedia with python : can't get a column

I am trying to scrape a table from Wikipedia
<tr>
<td>1</td>
<td><span class="nowrap"><span class="datasortkey" data-sort-value="Etats unis"><span class="flagicon"><a class="image" href="/wiki/Fichier:Flag_of_the_United_States.svg" title="Drapeau des États-Unis"><img alt="Drapeau des États-Unis" class="noviewer thumbborder" data-file-height="650" data-file-width="1235" height="11" src="//upload.wikimedia.org/wikipedia/commons/thumb/a/a4/Flag_of_the_United_States.svg/20px-Flag_of_the_United_States.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/a/a4/Flag_of_the_United_States.svg/30px-Flag_of_the_United_States.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/a/a4/Flag_of_the_United_States.svg/40px-Flag_of_the_United_States.svg.png 2x" width="20" /></a> </span>États-Unis</span></span></td>
<td>19 390,60 </td>
</tr>
As you may have noticed, there are 3 columns, and here is the code I'm using:
A = []
B = []
C = []
for row in DataFondMonetaireInt.findAll("tr"):
    cells = row.findAll("td")
    # Only rows with exactly three cells are treated as data rows.
    if len(cells) == 3:
        # find(text=True) is kept as in the question: it is the call whose
        # result for the second column is being asked about.
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))
It works well for A and C but not for B: I can't get the country name (in the example: Etats Unis).
Why doesn't it work?
Thank you in advance.
Use .text instead of .find(text=True):
DataFondMonetaireInt = BeautifulSoup(html_text, "html.parser")
A = []
B = []
C = []
for row in DataFondMonetaireInt.findAll("tr"):
    cells = row.findAll("td")
    # Only rows with exactly three cells are treated as data rows.
    if len(cells) == 3:
        # .text gathers all the text inside a cell, however deeply nested;
        # strip() removes the padding around it (e.g. the space after the amount).
        first, second, third = (cell.text.strip() for cell in cells)
        A.append(first)
        B.append(second)
        C.append(third)
You could do the following to get each table
import pandas as pd
# read_html parses the page and returns a list of DataFrames, one per table found.
tables = pd.read_html("https://fr.wikipedia.org/wiki/Liste_des_pays_par_PIB_nominal")
# The first three tables; as a bare expression this is only displayed in an interactive session.
[tables[i] for i in range(3)]
You can also use Wikipedia API to get the WikiText data :
import requests
import wikitextparser as wtp
import re

# Ask the MediaWiki API for the raw wikitext of the page.
r = requests.get(
    'https://fr.wikipedia.org/w/api.php',
    params={
        'action': 'parse',
        'page': 'Liste_des_pays_par_PIB_nominal',
        'contentmodel': 'wikitext',
        'prop': 'wikitext',
        'format': 'json'
    },
    # requests never times out on its own; fail instead of hanging forever.
    timeout=30,
)
# Surface HTTP errors here rather than as a confusing KeyError below.
r.raise_for_status()
data = wtp.parse(r.json()['parse']['wikitext']['*'])

# An integer with an optional decimal part.  The previous pattern,
# '[0-9]+[.[0-9]+]?', was a malformed character class that needed at least
# two characters and therefore could not match a single digit.
f = re.compile(r'[0-9]+(?:\.[0-9]+)?')

for i in range(1, 4):
    table_rows = []
    for t in data.tables[i].data():
        # Parse the second cell once; rows without a template in it are skipped.
        templates = wtp.parse(t[1]).templates
        if not templates:
            continue
        table_rows.append((t[0], templates[0].name, float(f.findall(t[2])[0])))
    print(table_rows)
The above will give you data from the 3 tables using WikiTextParser library

Splitting HTML text by <br> while using beautifulsoup

HTML code:
<td> <label class="identifier">Speed (avg./max):</label> </td> <td class="value"> <span class="block">4.5 kn<br>7.1 kn</span> </td>
I need to get the values 4.5 kn and 7.1 as separate list items so I can append them separately. I do not want to split it; I wanted to split the text string using re.sub, but it does not work. I also tried to use replace to replace the br, but it did not work. Can anybody provide any insight?
Python code:
def NameSearch(shipLink, mmsi, shipName):
# Python 2 code (urllib2, print statement).  Downloads the page at shipLink and
# returns a list that starts with mmsi and shipName, followed by the text of
# each table cell that comes right after a "Speed (avg./max):" cell.
from bs4 import BeautifulSoup
import urllib2
import csv
import re
values = []
values.append(mmsi)
values.append(shipName)
# Used to remove newlines, carriage returns and tabs from cell text.
regex = re.compile(r'[\n\r\t]')
# Flag: 1 means the previous cell was the speed label.
i = 0
# NOTE(review): `writer` is created but not used anywhere in the visible code.
with open('Ship_indexname.csv', 'wb')as f:
writer = csv.writer(f)
# Retry the download until it succeeds; any exception triggers another attempt.
while True:
try:
shipPage = urllib2.urlopen(shipLink, timeout=5)
except urllib2.URLError:
continue
except:
continue
break
soup = BeautifulSoup(shipPage, "html.parser") # Read the web page HTML
#soup.find('br').replaceWith(' ')
#for br in soup('br'):
#br.extract()
table = soup.find_all("table", {"id": "vessel-related"}) # Finds tables whose id is "vessel-related"
for mytable in table: #Loops over the tables found above
table_body = mytable.find_all('tbody') #Finds tbody section in table
for body in table_body:
rows = body.find_all('tr') #Finds all rows
for tr in rows: #Loops rows
cols = tr.find_all('td') #Finds the columns
for td in cols: #Loops the columns
# Cell text reduced to ASCII, then with newlines/tabs removed, for the label comparison.
checker = td.text.encode('ascii', 'ignore')
check = regex.sub('', checker)
if check == ' Speed (avg./max): ':
i = 1
elif i == 1:
print td.text
# NOTE(review): td.text is text content without markup, so this <br/> pattern
# presumably never finds anything to replace - confirm.
pat=re.compile('<br\s*/>')
print pat.sub(" ",td.text)
values.append(td.text.strip("\n").encode('utf-8')) #Takes the second columns value and assigns it to a list called Values
i = 0
#print values
return values
NameSearch('https://www.fleetmon.com/vessels/kind-of-magic_0_3478642/','230034570','KIND OF MAGIC')
Locate the "Speed (avg./max)" label first and then go to the value via .find_next():
from bs4 import BeautifulSoup
# Sample markup from the question.
data = '<td> <label class="identifier">Speed (avg./max):</label> </td> <td class="value"> <span class="block">4.5 kn<br>7.1 kn</span> </td>'
soup = BeautifulSoup(data, "html.parser")
# Locate the label by its class and its exact text.
label = soup.find("label", class_="identifier", text="Speed (avg./max):")
# Text of the next td.value after the label; with strip=True the two speeds end up adjacent.
value = label.find_next("td", class_="value").get_text(strip=True)
print(value) # prints 4.5 kn7.1 kn
Now, you can extract the actual numbers from the string:
import re
speed_values = re.findall(r"([0-9.]+) kn", value)
print(speed_values)
Prints ['4.5', '7.1'].
You can then further convert the values to floats and unpack into separate variables:
avg_speed, max_speed = map(float, speed_values)

Pick up texts from a batch of files, and write them into an Excel file

(Environment: Python 2.7.6 Shell IDLE + BeautifulSoup 4.3.2 + )
I want to pick up some texts from a batch of files (about 50 files), and put them nicely into an Excel file, either row by row, or column by column.
The text sample in each file contains below:
<tr>
<td width=25%>
Arnold Ed
</td>
<td width=15%>
18 Feb 1959
</td>
</tr>
<tr>
<td width=15%>
男性
</td>
<td width=15%>
02 March 2002
</td>
</tr>
<tr>
<td width=15%>
Guangxi
</td>
</tr>
What I have worked out so far is shown below. The approach is to read the files one by one. The code runs fine up to the text pick-up part, but the texts are not written into the Excel file.
from bs4 import BeautifulSoup
import xlwt

list_open = open("c:\\file list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")
for each_file in line_in_list:
    page = open(each_file)
    soup = BeautifulSoup(page.read())
    all_texts = soup.find_all("td")
    for a_t in all_texts:
        a = a_t.renderContents()
        #"print a" here works ok
# The statements below sit after both loops, so they run once and write only
# the last value of `a`.  That is the behaviour the question asks about, so
# the layout is kept as posted.
book = xlwt.Workbook(encoding='utf-8', style_compression = 0)
sheet = book.add_sheet('namelist', cell_overwrite_ok = True)
sheet.write(0, 0, a)
book.save("C:\\details.xls")
Actually, it is only writing the last piece of text into the Excel file. How can I get it done correctly?
With laike9m's help, the final version is:
# Read the list of file names and close the list file straight away.
with open("c:\\file list.txt") as list_open:
    read_list = list_open.read()
# splitlines() plus the blank-line filter avoids calling open("") when the
# list file ends with a newline or contains empty lines.
line_in_list = [line for line in read_list.splitlines() if line.strip()]

book = xlwt.Workbook(encoding='utf-8', style_compression = 0)
sheet = book.add_sheet('namelist', cell_overwrite_ok = True)
# One spreadsheet row per file, one column per <td> found in that file.
for i, each_file in enumerate(line_in_list):
    with open(each_file) as page:
        soup = BeautifulSoup(page.read())
    all_texts = soup.find_all("td")
    for j, a_t in enumerate(all_texts):
        a = a_t.renderContents()
        sheet.write(i, j, a)
# Save once, after every cell has been written.
book.save("C:\\details.xls")
You didn't put the last four lines into the for loop. I guess that's why it's only writing the last piece of text into the Excel file.
EDIT
book = xlwt.Workbook(encoding='utf-8', style_compression = 0)
sheet = book.add_sheet('namelist', cell_overwrite_ok = True)
# One spreadsheet row per file, one column per <td> found in that file.
for i, each_file in enumerate(line_in_list):
    # The with-block closes each input file once it has been parsed.
    with open(each_file) as page:
        soup = BeautifulSoup(page.read())
    all_texts = soup.find_all("td")
    for j, a_t in enumerate(all_texts):
        a = a_t.renderContents()
        sheet.write(i, j, a)
# Save once, after every cell has been written.
book.save("C:\\details.xls")

Categories

Resources