How do I export an HTML table to a CSV file using Python?

I scraped an HTML table from the Yahoo Finance website and tried to export the table to a CSV file. However, the CSV file does not contain the correct output, while the printed output on my terminal looks fine. What have I done wrong here?
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

mystocks = ["XOM", "CVX", "COP", "EOG"]
stockdata = []

def getData(symbol):
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"}
    url = f"https://finance.yahoo.com/quote/{symbol}/key-statistics"
    soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
    print("Ticker - " + symbol)
    for t in soup.select("table"):
        for tr in t.select("tr:has(td)"):
            for sup in tr.select("sup"):
                sup.extract()
            stockdata = [td.get_text(strip=True) for td in tr.select("td")]
            if len(stockdata) == 2:
                print("{:<50} {}".format(*stockdata))

for item in mystocks:
    stockdata.append(getData(item))

df = pd.DataFrame(stockdata)
df.to_csv('file_name.csv')

You are printing the data, not returning it.
If you want all the data in one table, it is good to add a column with the symbol each row originated from. You could use something like this:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

mystocks = ["XOM", "CVX", "COP", "EOG"]

def getData(symbol):
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"}
    url = f"https://finance.yahoo.com/quote/{symbol}/key-statistics"
    soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
    print("Ticker - " + symbol)
    for t in soup.select("table"):
        for tr in t.select("tr:has(td)"):
            for sup in tr.select("sup"):
                sup.extract()
            stockdata = [td.get_text(strip=True) for td in tr.select("td")]
            if len(stockdata) == 2:
                # add a column with the symbol to help afterwards
                yield [symbol] + stockdata

# this will concatenate the rows for all the symbols in mystocks
df = pd.DataFrame([r for item in mystocks for r in getData(item)])
df.to_csv('file_name.csv')
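If you also want named columns in the CSV, a minimal variant of the last two lines (the column labels here are illustrative assumptions, not part of the original answer) could be:

# build the frame with explicit column labels and drop the numeric index from the CSV
df = pd.DataFrame(
    [r for item in mystocks for r in getData(item)],
    columns=["symbol", "metric", "value"],
)
df.to_csv("file_name.csv", index=False)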

Related

How to grab some part of the link inside the td tag in python

I'm trying to grab the link inside a td. My code does not display the link or produce the desired output. What do I need to change?
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from time import sleep
import requests

headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"}
urllink = "https://bscscan.com/txs?block=11711353&ps=100&p=1"

reqblockdetails = requests.get(urllink, headers=headers, timeout=5)
soupblockdetails = BeautifulSoup(reqblockdetails.content, 'html.parser')
rowsblockdetails = soupblockdetails.findAll('table')[0].findAll('tr')
sleep(1)
for row in rowsblockdetails[1:]:
    txnhash = row.find_all('td')[1].text[0:]
    txnhashdetails = txnhash.strip()
    destination = row.find_all('td')[8].text[0:]
    destination = destination.strip()
    if str(destination) == "CoinOne: CONE Token":
        urldest = soupblockdetails.find('a', attrs={'class': 'hash-tag text-truncate'}).text
        print(" {:>1} {:<5}".format(txnhashdetails, destination))
        print(urldest)
    else:
        pass
Current Output:
0x8265a6ba5ce531df645b883e8735af57241f43e92eb3c9a88f43b89310f964bc CoinOne: CONE Token Validator: Stake2me
Needed Output:
0x8265a6ba5ce531df645b883e8735af57241f43e92eb3c9a88f43b89310f964bc CoinOne: CONE Token 0x9628735017f1a985ebaac0b203efb9e8d3ed0fef
It would be better to search for the <a> element in the currently selected <td> rather than in the whole document, so I changed the code to td = row.find_all('td')[8] and later td.find('a', ...).
Here is the working code:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from time import sleep
import requests

headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"}
urllink = "https://bscscan.com/txs?block=11711353&ps=100&p=1"

reqblockdetails = requests.get(urllink, headers=headers, timeout=5)
soupblockdetails = BeautifulSoup(reqblockdetails.content, 'html.parser')
rowsblockdetails = soupblockdetails.findAll('table')[0].findAll('tr')
sleep(1)
for row in rowsblockdetails[1:]:
    txnhash = row.find_all('td')[1].text[0:]
    txnhashdetails = txnhash.strip()
    td = row.find_all('td')[8]
    destination = td.text[0:].strip()
    if str(destination) == "CoinOne: CONE Token":
        urldest = td.find('a', attrs={'class': 'hash-tag text-truncate'})["href"].lstrip("/address/")
        print(" {:>1} {:<5}".format(txnhashdetails, destination))
        print(urldest)
    else:
        pass
Hope it will work. Alternatively, you can try this:
t_link = soupblockdetails.find('span', attrs={'class': 'hash-tag text-truncate'})
urldest = t_link.a['href']
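One caveat on the working code above: str.lstrip("/address/") strips any leading characters drawn from the set /, a, d, r, e, s rather than removing the literal prefix, which happens to work for hrefs like /address/0x... but is fragile. A small sketch, assuming Python 3.9+ where str.removeprefix is available:

href = td.find('a', attrs={'class': 'hash-tag text-truncate'})["href"]
# removeprefix strips the exact leading string instead of a character set (Python 3.9+)
urldest = href.removeprefix("/address/")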

Loop page with beautifulsoup

I would like to scrape the player URLs from all pages of this website https://www.transfermarkt.it/detailsuche/spielerdetail/suche/27564780, but I can only scrape the first page. Why?
I wrote a for loop with range():
import pandas as pd
import requests
from bs4 import BeautifulSoup

list_url = []

def get_player_urls(page):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0"
    }
    link = 'https://www.transfermarkt.it/detailsuche/spielerdetail/suche/27564780/page/{page}'
    content = requests.get(link, headers=headers)
    soup = BeautifulSoup(content.text, 'html.parser')
    for urls in soup.find_all('a', class_='spielprofil_tooltip'):
        url = 'https://www.transfermarkt.it' + urls.get('href')
        print(url)
        list_url.append(url)
    return

for page in range(1, 11, 1):
    get_player_urls(page)

df_url = pd.DataFrame(list_url)
df_url.to_csv('df_url.csv', index=False, header=False)
You're not actually inserting the page into the URL. Also, there is no need to put return at the end of your function; you aren't returning anything:
import pandas as pd
import requests
from bs4 import BeautifulSoup

list_url = []

def get_player_urls(page):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0"
    }
    link = 'https://www.transfermarkt.it/detailsuche/spielerdetail/suche/27564780/page/{page}'.format(page=page)  # <-- Add this
    content = requests.get(link, headers=headers)
    soup = BeautifulSoup(content.text, 'html.parser')
    for urls in soup.find_all('a', class_='spielprofil_tooltip'):
        url = 'https://www.transfermarkt.it' + urls.get('href')
        print(url)
        list_url.append(url)

for page in range(1, 11, 1):
    get_player_urls(page)

df_url = pd.DataFrame(list_url)
df_url.to_csv('df_url.csv', index=False, header=False)
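Equivalently (assuming Python 3.6 or newer is available), the same interpolation can be written with an f-string instead of .format:

# same URL construction with an f-string
link = f'https://www.transfermarkt.it/detailsuche/spielerdetail/suche/27564780/page/{page}'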

Python: BeautifulSoup find attribute

I'd like to scrape a website to get some percentages. So far this is the code:
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

lista = []
site = 'https://es.investing.com/indices/indices-futures'
harware = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0'}

request = Request(site, headers=harware)
page = urlopen(request)
soup = BeautifulSoup(page, 'html.parser')
#print(soup)
cotizacion = soup.find_all('td', {"class": "datatable_cell__3gwri datatable_cell--align-end__Wua8C datatable_cell--" + "down__2CL8n" + " datatable_cell--bold__3e0BR table-browser_col-chg-pct__9p1T3"})
for datos in cotizacion:
    indices = datos.get_text()
    lista.append(indices)
print(lista)
With this, I am getting a bunch of percentages in a list.
But my problem is that the class attribute only gets data when the percentage is negative, because the class name is for down ("down__2CL8n"); when it is up, the class name is the same except for that part ("up__2984w"). And I want to get both, positive and negative.
So I tried the find with:
soup.find_all('td',{"class": "datatable_cell__3gwri datatable_cell--align-end__Wua8C datatable_cell--" + "down__2CL8n" or "up__2984w" +" datatable_cell--bold__3e0BR table-browser_col-chg-pct__9p1T3"})
But that doesn't work.
What is the right format to match a variable part of the string?
The desired output is under the attribute table-browser_col-chg-pct__9p1T3. To select only the first table you can use the CSS selector .mb-6 td.table-browser_col-chg-pct__9p1T3.
import requests
from bs4 import BeautifulSoup
URL = "https://es.investing.com/indices/indices-futures"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
}
soup = BeautifulSoup(requests.get(URL, headers=headers).content, "html.parser")
print([tag.text for tag in soup.select(".mb-6 td.table-browser_col-chg-pct__9p1T3")])
Output:
['+0,12%', '+0,73%', '+1,97%', '+0,95%', '+1,13%', '+0,03%', '-0,15%', '-0,73%', '-0,05%', '+0,22%', '-0,65%', '-0,16%', '-0,37%', '-0,21%', '+0,11%', '-0,41%', '-0,40%', '-0,15%', '-0,38%', '+0,69%', '-0,89%', '-1,13%', '+0,23%', '-0,89%', '-0,75%', '-1,51%', '-0,22%', '+0,43%', '-1,27%', '+0,92%']
I would avoid what could be dynamic class values and instead determine which column the desired values fall under, then use :nth-of-type to slice that column out of the table. To get the table I would use an attribute = value selector for the parent element with data-test=price-table, then move to the child table element with a descendant combinator. The aim is to end up with something more robust over time, although of course it introduces a dependency on the header string.
import requests
from bs4 import BeautifulSoup
URL = "https://es.investing.com/indices/indices-futures"
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"}
soup = BeautifulSoup(requests.get(URL, headers=headers).content, "html.parser")
index = [i.text for i in soup.select('[data-test=price-table] table th')].index('% Var.') + 1
print([i.text for i in soup.select(f"[data-test=price-table] table td:nth-of-type({index})")])
You could also just use pandas read_html:
import pandas as pd
table = pd.read_html('https://es.investing.com/indices/indices-futures')[0]
table['% Var.']
You could just do this as a next step (assuming the order doesn't matter):
cotizacion += soup.find_all('td',{"class": "datatable_cell__3gwri datatable_cell--align-end__Wua8C datatable_cell--" + "up__2984w" +" datatable_cell--bold__3e0BR table-browser_col-chg-pct__9p1T3"})
Edit:
As the comments point out, the order does matter, so you can refer to this answer:
https://stackoverflow.com/a/14257743/8651239
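For completeness, a minimal sketch of matching both the up and down cells without writing out the full class strings; it assumes the table-browser_col-chg-pct fragment of these hashed class names stays stable, which is not guaranteed:

# substring match on the class attribute catches both down__2CL8n and up__2984w cells, in document order
cotizacion = soup.select('td[class*="table-browser_col-chg-pct"]')
lista = [td.get_text() for td in cotizacion]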

Extracting table data using beautifulsoup in python

I have a webpage - https://www.1800wheelchair.com/category/369/transport-wheelchairs/ - from which I want to extract the name, URL, SKU and specifications (from the table) of each product. I wrote the code below, but I am getting an empty Excel file. I have been trying to fix it for a while but can't figure out what is going wrong.
import requests
import xlsxwriter
from bs4 import BeautifulSoup

def cpap_spider(max_pages):
    global row_i
    page = 1
    while page <= max_pages:
        url = "https://www.1800wheelchair.com/category/369/transport-wheelchairs/?p=" + str(page)
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
        soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
        for link in soup.findAll("h2", {"class": "product-name"}):
            href = link.find("a")['href']
            title = link.string
            worksheet.write(row_i, 0, title)
            each_item(href)
            print(href)
            #print(title)
        page += 1

def each_item(item_url):
    global cols_names, row_i
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
    soup = BeautifulSoup(requests.get(item_url, headers=headers).content, 'html.parser')
    table = soup.find("table", {"class": "specifications "})
    if table:
        table_rows = table.find_all('tr')
    else:
        return
    for row in table_rows:
        cols = row.find_all('td')
        for ele in range(0, len(cols)):
            temp = cols[ele].text.strip()
            if temp:
                if temp[-1:] == ":":
                    temp = temp[:-1]
                # Name of column
                if ele == 0:
                    try:
                        cols_names_i = cols_names.index(temp)
                    except:
                        cols_names.append(temp)
                        cols_names_i = len(cols_names) - 1
                        worksheet.write(0, cols_names_i + 1, temp)
                    continue
                worksheet.write(row_i, cols_names_i + 1, temp)
    row_i += 1

cols_names = []
cols_names_i = 0
row_i = 1
workbook = xlsxwriter.Workbook('all_appended.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, "Title")

cpap_spider(1)

workbook.close()
You have an extra space in your class name, {"class": "specifications "}; with it removed, the Excel file is generated with multiple spec columns and data rows.
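If you would rather not depend on the exact class string (trailing space included), one hedged alternative is to match on a substring of the class value, for example:

# matches the specifications table whether or not the class value carries a trailing space
table = soup.find("table", class_=lambda c: c and "specifications" in c)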
As a suggestion, if you're willing to add some extra libraries, you can use pandas to read the specification tables as data frames with pd.read_html and use the built-in df.to_excel to write an Excel file (it can use the same xlsxwriter engine you're already using) without worrying about incrementing rows and columns.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from functools import reduce

AGENT = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
BASE_URL = "https://www.1800wheelchair.com/"
CATG_URL = "category/369/transport-wheelchairs/?p="

def cpap_spider(max_pages):
    chair_names = ["Specs"]
    chair_tables = ''
    page = 1
    while page <= max_pages:
        url = BASE_URL + CATG_URL + str(page)
        soup = BeautifulSoup(requests.get(
            url, headers=AGENT).content, 'html.parser')
        for link in soup.findAll("h2", {"class": "product-name"}):
            href = link.find("a")['href']
            title = link.string
            chair_name = href.replace(BASE_URL + "product/", "")
            chair_names.append(chair_name[:20])
            chair_tables += each_item(href)
            print(href)
        page += 1
    return [chair_names, chair_tables]

def each_item(item_url):
    soup = BeautifulSoup(requests.get(
        item_url, headers=AGENT).content, 'html.parser')
    table = soup.find("table", {"class": "specifications"})
    if table:
        return str(table)

chair_name, chair_list = cpap_spider(1)

# create a list of dataframes from html tables
df = pd.read_html(chair_list)
# merge the spec. tables list into one dataframe
all_chairs = reduce(lambda left, right: pd.merge(left, right, on=[0], how='outer'), df)
# add chair names as indices
all_chairs.columns = chair_name
all_chairs.set_index("Specs", drop=True, inplace=True)
# transpose to get chairs as index and specs as columns
all_chairs = all_chairs.T
all_chairs.to_excel("all_appended.xlsx")
Output from all_appended.xlsx

Python BeautifulSoup4 not nesting/iterating

I'm trying to scrape product data off of adidas and nike and am successful in getting the first product's details, but can't get it to iterate through to any additional products. It looks like this is because bs4 is not nesting the items inside the div I'm pointing to into a list, and is therefore not letting me iterate. I've tried searching around for solutions, but most things I read just point to changing the parser to lxml or html5. I don't think it's an issue with the sites' HTML, since I'm getting the same issue for both nike and adidas. Both chunks of code are:
Nike
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
url = 'https://www.nike.com/w/mens-shoes-nik1zy7ok'
page = requests.get(url, headers=headers)

if page.status_code == 200:
    soup = BeautifulSoup(page.content, 'html.parser')
    product_grid = soup.findAll('div', attrs={'class': 'product-grid__items'})
    #print(product_grid)
    products = []
    for card in product_grid:
        name = card.find('a', attrs={'class': 'product-card__link-overlay'})
        products.append(name.text)
    print(products)
Returns
['Nike Air VaporMax Flyknit 3']
Adidas
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
url = 'https://www.adidas.com/us/men-shoes'
page = requests.get(url, headers=headers)

if page.status_code == 200:
    soup = BeautifulSoup(page.content, 'html5lib')
    product_grid = soup.findAll('div', attrs={'class': 'product-container___3GvlZ'})
    #print(product_grid)
    products = []
    for card in product_grid:
        name = card.find('div', attrs={'class': 'gl-product-card__name'})
        products.append(name.text)
    print(products)
Returns
['NMD_R1 Shoes']
You can check the following code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
url = 'https://www.nike.com/w/mens-shoes-nik1zy7ok'
page = requests.get(url, headers=headers)

if page.status_code == 200:
    soup = BeautifulSoup(page.content, 'html.parser')
    product_grid = soup.findAll('div', attrs={'class': 'product-grid__items'})
    products = []
    for card in product_grid:
        names = card.findAll('a', attrs={'class': 'product-card__link-overlay'})
        for element in names:
            products.append(element.text)
    print(products)
The issue was in: name = card.find('a', attrs={'class': 'product-card__link-overlay'}).
If you print it out, you get a single name because you are using .find, not .findAll.
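An equivalent way to collect every match, sketched with a CSS selector instead of findAll (same class name assumed):

# select() always returns a list of all matching elements
products = [a.text for a in soup.select('a.product-card__link-overlay')]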
