First, there is a leading "1" in the returned string and I'm having trouble iterating past it. I've tried slicing with [0:] and get stuck. I'd like to drop or skip it to get to the second value, which is the id from the scraped table.
Additionally, when formatting the returned items from the table for storage with my store() function, I keep getting index-out-of-range errors.
import requests
from bs4 import BeautifulSoup
import MySQLdb
# MySQL portion
mydb = MySQLdb.connect(host='****',
                       user='****',
                       passwd='****',
                       db='****')
cur = mydb.cursor()

def store(id, ticker):
    # Plain %s placeholders: the driver quotes the values itself.
    cur.execute('INSERT IGNORE INTO TEST (id, ticker) VALUES (%s, %s)', (id, ticker))
    cur.connection.commit()
base_url = 'http://finviz.com/screener.ashx?v=152&s=ta_topgainers&o=price&c=0,1,2,3,4,5,6,24,25,63,64,65,66,67'
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")
main_div = soup.find('div', attrs = {'id':'screener-content'})
table = main_div.find('table')
sub = table.findAll('tr')
cells = sub[5].findAll('td')
for cell in cells:
    link = cell.a
    if link is not None:
        link = link.get_text()
        id = link[0]  # note: link is a string here, so this is its first character
        ticker = link[1]
        store(id, ticker)
        print(link)
I don't know exactly what you're trying to do, but this works for me:
import requests
from bs4 import BeautifulSoup
base_url = 'http://finviz.com/screener.ashx?v=152&s=ta_topgainers&o=price&c=0,1,2,3,4,5,6,24,25,63,64,65,66,67'
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")
rows = soup.find_all('tr', class_=["table-dark-row-cp", "table-light-row-cp"])
for row in rows:
    columns = row.find_all('td')
    id_ = columns[0].a.get_text()
    ticker = columns[1].a.get_text()
    company = columns[2].a.get_text()
    sector = columns[3].a.get_text()
    industry = columns[4].a.get_text()
    print(id_, ticker, company, sector, industry)
Or even simpler, selecting the <a> elements directly:
for row in rows:
    columns = row.find_all('a')
    id_ = columns[0].get_text()
    ticker = columns[1].get_text()
    company = columns[2].get_text()
    sector = columns[3].get_text()
    industry = columns[4].get_text()
    print(id_, ticker, company, sector, industry)
BTW: you can also use a CSS selector:
rows = soup.select('#screener-content table[bgcolor="#d3d3d3"] tr[class]')
or
rows = soup.select('#screener-content table[bgcolor="#d3d3d3"] tr')
# skip first row with headers
rows = rows[1:]
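Putting the selector to work end to end, here is a minimal sketch; it assumes the finviz markup shown above is unchanged, and the guard on the number of links is my addition:
import requests
from bs4 import BeautifulSoup

base_url = 'http://finviz.com/screener.ashx?v=152&s=ta_topgainers&o=price&c=0,1,2,3,4,5,6,24,25,63,64,65,66,67'
soup = BeautifulSoup(requests.get(base_url).content, "html.parser")
rows = soup.select('#screener-content table[bgcolor="#d3d3d3"] tr')
for row in rows[1:]:  # skip the header row
    links = row.find_all('a')
    if len(links) >= 2:  # skip rows that don't carry id/ticker links
        id_ = links[0].get_text()
        ticker = links[1].get_text()
        print(id_, ticker)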
My goal is to sequentially fetch URLs from an sqlite3 database and download quotes from a website. The download routine for a single title, without the iterative process, works:
import requests
from bs4 import BeautifulSoup as bs
input_str = input("Insert stock's url:")
if input_str == "":
    input_str = "https://www.teleborsa.it/indici-italia/ftse-mib"
res = requests.get(input_str)
soup = bs(res.content, 'lxml')
price = soup.find("span", class_="h-price fc0").text
print("Stock price", input_str, "is", price)
Problems appear when I use an sqlite database that contains various URLs. The SQL query is correct and the table is read, but the following code gives an error:
# reading records
for row in rows:
    input_str = row[6]
    res = request.get(input_str)
    soup = bs(res.content, 'html.parser')
    price = soup.find("span", class_="h-price fc0").text
    curdata = soup.find("div", class_="header-bottom fc3").text
I just did a test like below and everything was ok:
rows = ["https://www.teleborsa.it/indici-italia/ftse-mib"] * 5
for row in rows:
    input_str = row
    res = requests.get(input_str)
    soup = bs(res.content, 'html.parser')
    price = soup.find("span", class_="h-price fc0").text
    curdata = soup.find("div", class_="header-bottom fc3").text
    print(price, "\t\t", curdata)
First, you missed an "s": it's requests, not request. And the problem is probably the row[6] part; make sure you are using correct URLs when retrieving them from the database. If you want more help, please share some of the database output.
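If you want to find the bad rows rather than crash on them, here is a minimal defensive sketch; the row[6] column index and the bs alias are taken from your code, and treating anything that doesn't start with http as "bad" is my assumption:
import requests
from bs4 import BeautifulSoup as bs

# `rows` is assumed to come from the sqlite cursor, as in the question
for row in rows:
    input_str = row[6]
    # skip empty or malformed URLs coming from the database
    if not input_str or not input_str.startswith('http'):
        print('skipping bad url:', repr(input_str))
        continue
    try:
        res = requests.get(input_str, timeout=10)
        res.raise_for_status()
    except requests.RequestException as e:
        print('request failed for', input_str, '->', e)
        continue
    soup = bs(res.content, 'html.parser')
    price = soup.find('span', class_='h-price fc0')
    if price is not None:
        print(input_str, price.text)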
Here is the complete code. With only one record the code works, so I suspect some of the URLs are incorrect.
import sqlite3
from openpyxl import Workbook
import datetime
import requests
from bs4 import BeautifulSoup as bs
# connection
con = sqlite3.connect("/demo.sqlite")
cursor = con.cursor()
cursor.execute("select * from tbLista")
rows = cursor.fetchall()
# prepare the worksheet
wb = Workbook()
ws = wb.active
ws.delete_cols(1, 2)
ws['A1'] = "Isin"
ws['B1'] = "Price"
riga = 1
# read the records
for row in rows:
    riga += 1
    ws['A' + str(riga)] = row[4]
    ws['B' + str(riga)] = row[5]
    input_str = row[5]
    res = requests.get(input_str)
    soup = bs(res.content, 'lxml')
    price = soup.find("span", class_="h-price fc0").text
    curdata = soup.find("div", class_="header-bottom fc3").text
    # convert the Italian number format (1.234,56) to a float
    firstmod = price.replace('.', '')
    secondmod = float(firstmod.replace(',', '.'))
    ws['B' + str(riga)] = secondmod
# close up
cursor.close()
wb.save("demo.xlsx")
print("ok")
I haven't written any error handling around the soup lookups.
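For completeness, a minimal sketch of that missing error handling, slotted into the loop body above; the class names come from the code, and treating a missing element as a row to skip is my assumption:
price_tag = soup.find("span", class_="h-price fc0")
curdata_tag = soup.find("div", class_="header-bottom fc3")
if price_tag is None or curdata_tag is None:
    # the page layout changed or the URL was wrong; skip this row
    print("could not parse", input_str)
else:
    price = price_tag.text
    curdata = curdata_tag.text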
I am new to Python. I am using it in a Jupyter notebook to scrape a table from Wikipedia. All the code I wrote works, except when I want to put the information into a csv file. The error that appears is "list index out of range".
Here is the code:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
s = requests.Session()
response = s.get(url, timeout=10)
response
table_id = 'main'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.prettify().encode('UTF-8'))
table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr'):
    print(row)
table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    print(col[0].find('a').contents[0])
    print(col[1].string)  # name
    print(col[2].string)
    print(col[3].string)
    print(col[4].string)
    print(col[5].find(text=True))
csvfile = open('population.csv', 'w')
csvwriter = csv.writer(csvfile, delimiter=',')
headers = ('COUNTRY','CONTINENT','SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
csvwriter.writerow(headers)
table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    country = col[0].find('a').contents[0]
    continent = col[1].string
    subregion = col[2].string
    population_2018 = col[3].string
    population_2019 = col[4].string
    change = col[5].find(text=True)
    parsed_row = (country, continent, subregion, population_2018, population_2019, change)
    csvwriter.writerow(parsed_row)
csvfile.close()
Thank you very much!
I have a two-part answer: the easiest way to accomplish your task, and where the error in your code is.
Let pandas handle the requests, BeautifulSoup and csv for you.
import pandas as pd
URI = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'
df = pd.read_html(URI)[3]
df.to_csv('population.csv', index=False)
pandas has .read_html, which returns a list of all the tables in the webpage. Your table was at index 3. With that, I saved it with .to_csv.
With .read_html, you can pass the attributes of a specific table e.g. attrs = {'id': 'table'}
# the table is now at index 0
df = pd.read_html(URI, attrs={'id':'main'})[0]
You can also specify the parser that will be used by BeautifulSoup that .read_html calls:
df = pd.read_html(URI, attrs={'id':'main'}, flavor='lxml')[0]
# 'lxml' is known for speed. But you can use `html.parser` if `lxml` or `html5lib` are not installed.
See the .read_html documentation for more.
Update: Debugging Your Code
The error in your code comes from rows where col is empty; an if condition solves the problem:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
s = requests.Session()
response = s.get(url, timeout=10)
response
table_id = 'main'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
#print(soup.prettify().encode('UTF-8'))
csvfile = open('population.csv', 'w')
csvwriter = csv.writer(csvfile, delimiter=',')
headers = ('COUNTRY','CONTINENT','SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
csvwriter.writerow(headers)
table = soup.find('table', attrs={'id': table_id})
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    # this is all that was missing
    if col:
        country = col[0].find('a')['title']
        continent = col[1].string
        subregion = col[2].string
        population_2018 = col[3].string
        population_2019 = col[4].string
        change = col[5].find(text=True)
        parsed_row = (country, continent, subregion, population_2018, population_2019, change)
        csvwriter.writerow(parsed_row)
csvfile.close()
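As a side note, the csv module documentation recommends opening the file with newline='' (to avoid blank lines on Windows), and a with block closes the file even if a row raises. A sketch of the same write loop in that style, reusing headers and table from the code above:
with open('population.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')
    csvwriter.writerow(headers)
    for row in table.find_all('tr')[1:]:
        col = row.find_all('td')
        if col:
            csvwriter.writerow((col[0].find('a')['title'], col[1].string,
                                col[2].string, col[3].string,
                                col[4].string, col[5].find(text=True)))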
Prayson W. Daniel has already given the answer, and I offer another way.
import requests
from simplified_scrapy import SimplifiedDoc, utils
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)'
s = requests.Session()
res = s.get(url, timeout=10)
rows = []
headers = ('COUNTRY','CONTINENT','SUBREGION', 'POPULATION_2018', 'POPULATION_2019', 'CHANGE')
rows.append(headers)
table_id = 'main'
doc = SimplifiedDoc(res.text)
table = doc.select('table#'+table_id) # Get the table by id.
trs = table.tbody.children.children[1:] # Get all data rows
for tr in trs:
    row = [tr[0].a.text]  # first col: take the first link's text
    row.extend(tr.text[1:])  # remaining cols
    rows.append(row)
utils.save2csv('test_wiki.csv', rows) # Save data to csv
I'm making some progress with web scraping; however, I still need some help to perform some operations:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
out = []
for tr in soup.select('.col-md-4 tbody tr'):
In the class col-md-4 I know there are 3 tables. I want to generate a csv whose output has three values per row: first name, last name, and, as the last value, the header name of the table:
first name, last name, header table
Any help would be appreciated.
This is what I have done on my own:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
filename = url.rsplit('/', 1)[1] + '.csv'
tables = soup.select('.col-md-4 table')
rows = []
for table in tables:
    t = table.get_text(strip=True, separator='|').split('|')
    rows.append(t)
df = pd.DataFrame(rows)
print(df)
df.to_csv(filename)
Thanks,
This might work:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
tables = soup.select('.col-md-4 table')
rows = []
for table in tables:
    cleaned = list(table.stripped_strings)
    header, names = cleaned[0], cleaned[1:]
    data = [name.split(', ') + [header] for name in names]
    rows.extend(data)
result = pd.DataFrame.from_records(rows, columns=['surname', 'name', 'table'])
You need to first iterate through each table you want to scrape, then for each table, get its header and rows of data. For each row of data, you want to parse out the First Name and Last Name (along with the header of the table).
Here's a verbose working example:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
out = []
# Iterate through each of the three tables
for table in soup.select(".col-md-4 table"):
    # Grab the header and rows from the table
    header = table.select("thead th")[0].text.strip()
    rows = [s.text.strip() for s in table.select("tbody tr")]
    t = []  # This list will contain the rows of data for this table
    # Iterate through rows in this table
    for row in rows:
        # Split by comma (last_name, first_name)
        split = row.split(",")
        last_name = split[0].strip()
        first_name = split[1].strip()
        # Create the row of data
        t.append([first_name, last_name, header])
    # Convert list of rows to a DataFrame
    df = pd.DataFrame(t, columns=["first_name", "last_name", "table_name"])
    # Append to list of DataFrames
    out.append(df)
# Write to CSVs...
out[0].to_csv("first_table.csv", index=None)  # etc...
Whenever you're web scraping, I highly recommend using strip() on all of the text you parse to make sure you don't have superfluous spaces in your data.
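A quick illustration of the difference (the sample markup here is made up):
from bs4 import BeautifulSoup

cell = BeautifulSoup('<td>  Doe, John \n</td>', 'html.parser').td
print(repr(cell.text))                  # '  Doe, John \n'
print(repr(cell.get_text(strip=True)))  # 'Doe, John'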
I hope this helps!
I'm trying to web scrape a data table from Wikipedia using Python and bs4, but I'm stuck on this problem: when getting the data values, my code is not getting the first column (index zero). I feel there is something wrong with the indexing, but I can't figure it out. Please help.
import requests
import pandas as pd
from bs4 import BeautifulSoup

response_obj = requests.get('https://en.wikipedia.org/wiki/Metro_Manila').text
soup = BeautifulSoup(response_obj, 'lxml')
Neighborhoods_MM_Table = soup.find('table', {'class': 'wikitable sortable'})
rows = Neighborhoods_MM_Table.select("tbody > tr")[3:8]
cities = []
for row in rows:
    city = {}
    tds = row.select('td')
    city["City or Municipal"] = tds[0].text.strip()
    city["%_Population"] = tds[1].text.strip()
    city["Population"] = float(tds[2].text.strip().replace(",", ""))
    city["area_sqkm"] = float(tds[3].text.strip().replace(",", ""))
    city["area_sqm"] = float(tds[4].text.strip().replace(",", ""))
    city["density_sqm"] = float(tds[5].text.strip().replace(",", ""))
    city["density_sqkm"] = float(tds[6].text.strip().replace(",", ""))
    cities.append(city)
print(cities)
df = pd.DataFrame(cities)
df.head()
import requests
from bs4 import BeautifulSoup
import pandas as pd
def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    target = [item.get_text(strip=True) for item in soup.findAll(
        "td", style="text-align:right") if "%" in item.text] + [""]
    df = pd.read_html(r.content, header=0)[5]
    df = df.iloc[1:-1]
    df['Population (2015)[3]'] = target
    print(df)
    df.to_csv("data.csv", index=False)

main("https://en.wikipedia.org/wiki/Metro_Manila")
I have been trying to extract the contents of a table on a website.
from selenium import webdriver
from bs4 import BeautifulSoup as bs

descriptions = []
sources = []
values = []
site = 'https://www.eia.gov/todayinenergy/prices.php'  # address of the site
driver = webdriver.Chrome(executable_path=r"chromedriver.exe")
driver.execute_script("document.body.style.zoom='100%'")
driver.get(site)
soup_1 = bs(driver.page_source, 'lxml')  # parse the rendered page with Beautiful Soup
tables = soup_1.find_all('tbody')  # the table bodies of interest
print(len(tables))  # count them
for table in tables:
    rows = table.find_all('tr')
    print(len(rows))
    for row in rows:
        description = row.find('td', class_='s1')
        descriptions.append(description)
        source = row.find('td', class_='s2')
        sources.append(source)
        value = row.find('td', class_='d1')  # the cell that holds the data
        values.append(value)  # compile it all together
driver.close()
I have been trying to get clean text from the table; however, the data extracted looks like this:
<td class="s1" rowspan="3">Crude Oil<br/> ($/barrel)</td>
while I want just: Crude Oil ($/barrel)
When I tried
description = row.find('td', class_='s1').text.renderContents()
descriptions.append(descri_clean)
this error showed up:
AttributeError: 'NoneType' object has no attribute 'renderContents'
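For what it's worth, get_text() with a separator flattens the <br/> in the fragment quoted above into the clean string you want; a minimal sketch against that exact fragment:
from bs4 import BeautifulSoup

td = BeautifulSoup('<td class="s1" rowspan="3">Crude Oil<br/> ($/barrel)</td>',
                   'html.parser').td
print(td.get_text(' ', strip=True))  # Crude Oil ($/barrel)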
You can use just requests. You can filter out your values by matching strings against expected values for certain class attributes while looping over the table rows. I set the two tables of interest into separate variables, each a list of the rows within that table. The tables on the page each have their own distinct class identifier, e.g. t1, t2, and so on.
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://www.eia.gov/todayinenergy/prices.php')
soup = bs(r.content, 'lxml')
table1 = soup.select('.t1 tr')
table2 = soup.select('.t2 tr')
for item in table1:
    if 'Crude Oil ($/barrel) - Nymex Apr' in item.text:
        rowInfo = [td.text for td in item.select('td')]
        print(rowInfo)
    elif 'Ethanol ($/gallon) - CBOT Apr' in item.text:
        rowInfo = [td.text for td in item.select('td')]
        print(rowInfo)

for item in table2:
    if len(item.select('td')) == 4:
        header = item.select_one('td.s1').text
        if item.select_one('td.s2'):
            if item.select_one('td.s2').text in ['WTI', 'Brent', 'Louisiana Light', 'Los Angeles'] and header in ['Crude Oil ($/barrel)', 'Gasoline (RBOB) ($/gallon)']:
                rowInfo = [td.text for td in item.select('td')]
                print(rowInfo)