How to extract a table from the website using python? - python

I wrote a code to extract table from this website (http://www.nhb.gov.in/OnlineClient/MonthlyPriceAndArrivalReport.aspx), but I am unable to do so.
from selenium import webdriver
import time, re
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
import time
chrome_path = r"C:\Users\user\Desktop\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get("http://www.nhb.gov.in/OnlineClient/MonthlyPriceAndArrivalReport.aspx")
html_source = driver.page_source
results=[]
#cauliflower
element_month = driver.find_element_by_id ("ctl00_ContentPlaceHolder1_ddlmonth")
drp_month = Select(element_month)
drp_month.select_by_visible_text("January")
element_category_name = driver.find_element_by_id ("ctl00_ContentPlaceHolder1_drpCategoryName")
drp_category_name = Select(element_category_name)
drp_category_name.select_by_visible_text("VEGETABLES")
time.sleep(2)
element_crop_name = driver.find_element_by_id ("ctl00_ContentPlaceHolder1_drpCropName")
drp_crop_name = Select(element_crop_name)
drp_crop_name.select_by_value("117")
time.sleep(2)
element_variety_name = driver.find_element_by_id ("ctl00_ContentPlaceHolder1_ddlvariety")
drp_variety_name = Select(element_variety_name)
drp_variety_name.select_by_value("18")
element_state = driver.find_element_by_id ("ctl00_ContentPlaceHolder1_LsboxCenterList")
drp_state = Select(element_state)
drp_state.select_by_visible_text("AHMEDABAD")
driver.find_element_by_xpath("""//*[#id="ctl00_ContentPlaceHolder1_btnSearch"]""").click()
soup = BeautifulSoup(driver.page_source, 'html.parser')
table = pd.read_html(driver.page_source)[3]
#number three is arbitrary. I tried all numbers from 1 to 6 and python did not recognize the table at
#the bottom of the screen.
print(len(table))
print(table)
with pd.ExcelWriter(r'C:\Users\user\Desktop\python.xlsx') as writer:
table.to_excel(writer, sheet_name = "cauliflower", index=False) # cauliflower results on sheet named
cauliflower
writer.save()
Can you please help me figure out how to extract the table at the bottom of the website. Your help will be appreciated. Thank you in advance.

You can do that without using Beautiful soup. After click on search button.
Induce WebDriverWait() and wait for visibility_of_element_located()
Get the table element using get_attribute('outerHTML')
Then use pd.read_html(str(tableelement))[0] and print(table)
Rest you can do that to import in excel or csv.
Code:
driver.find_element_by_xpath("//*[#id='ctl00_ContentPlaceHolder1_btnSearch']").click()
tableelement=WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.CSS_SELECTOR,"table#ctl00_ContentPlaceHolder1_GridViewmonthlypriceandarrivalreport"))).get_attribute('outerHTML')
table = pd.read_html(str(tableelement))[0]
print(table)
You need to import below libraries.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
If you want to use BeautifulSoup as well then try this code.
driver.find_element_by_xpath("//*[#id='ctl00_ContentPlaceHolder1_btnSearch']").click()
WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.CSS_SELECTOR,"table#ctl00_ContentPlaceHolder1_GridViewmonthlypriceandarrivalreport")))
soup = BeautifulSoup(driver.page_source, 'html.parser')
table = pd.read_html(str(soup))[-1]
print(table)
Output:
S.No. CenterName ... Day30 Day31
0 1.0 AHMEDABAD / अहमदाबाद ... 1.002502e+15 2.005004e+15
1 NaN NaN ... NaN NaN
[2 rows x 35 columns]

Related

How to scrape data from each product page from Aliexpress using python selenium

I am trying to scrape each product page from this website: https://www.aliexpress.com/wholesale?catId=0&initiative_id=SB_20220315022920&SearchText=bluetooth+earphones
Especially I want to get comments and custumer countries as I mentionned in the photo:
enter image description here
The main issue is that my code does not inspect the right elements and this is what I am struggling with .
First, I tried my scraping on this product : https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch
Here is my code :
from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import html
import cssselect
from time import sleep
from itertools import zip_longest
import csv
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
url = "https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch"
with open ("data.csv", "w", encoding="utf-8") as csvfile:
wr = csv.writer(csvfile)
wr.writerow(["Comment","Custumer country"])
driver.get(url)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
review_buttom = driver.find_element_by_xpath('//li[#ae_button_type="tab_feedback"]')
review_buttom.click()
html_source = driver.find_element_by_xpath('//div[#id="transction-feedback"]')
tree = html.fromstring(html_source)
#tree = html.fromstring(driver.page_source)
for rvw in tree.xpath('//div[#class="feedback-item clearfix"]'):
country = rvw.xpath('//div[#class="user-country"]//b/text()')
if country:
country = country[0]
else:
country = ''
print('country:', country)
comment = rvw.xpath('//dt[#id="buyer-feedback"]//span/text()')
if comment:
comment = comment[0]
else:
comment = ''
print('comment:', comment)
driver.close()
Thank you !!
What happens?
There is one main issue, the feedback you are looking for is in an iframe, so you wont get your information by calling the elements directly.
How to fix?
Scroll into view of element that holds the iframe navigate to its source and interact with its pagination to get all the feedbacks.
Example
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get(url)
wait = WebDriverWait(driver, 10)
driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))
data=[]
while True:
for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
try:
country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
except:
country = None
try:
comment = e.find_element(By.CSS_SELECTOR, '.buyer-feedback span').text
except:
comment = None
data.append({
'country':country,
'comment':comment
})
try:
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
except:
break
pd.DataFrame(data).to_csv('filename.csv',index=False)

Adding new rows to a pandas df on loop

I'm curious how to append or concat a pandas df with new data coming from a looped interation. I'm using selenium to view the web pages and BeautifulSoup to read the HTML. From there, I get a two tables of data per page. I am running this over multiple pages and I want to add the data from table 1 on page 2 to the table 1 on page 1, and the same for table 2 on both pages.
I think I need an append function on the df, but I am not exactly sure.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
urls = ["https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06","https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/10"]
datalist_races = [] #empty list
x = 0 #counter
datalist_results = [] #empty list
x = 0 #counter
for url in urls:
driver = webdriver.Chrome()
driver.get(url)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
htmlStr = driver.page_source
soup_level1 = soup(htmlStr, 'html.parser')
race_soup = soup_level1.find('tbody',{'class':'f_fs13'}).find_parent('table')
results_soup = soup_level1.find('tbody',{'class':'f_fs12'}).find_parent('table')
df_races = pd.read_html(str(race_soup))[0]
datalist_races.append(df_races[0])
df_results = pd.read_html(str(results_soup))[0]
datalist_results.append(df_results[0])
print(df_results)
driver.close()
Any insight would be wonderful. Reading through the comments and posts here, as well as watching YT videos, have left me no further ahead.
In your loop do this to any df you want to append:
df.loc[len(df.index)] = data_element
so for your case
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
urls = ["https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06","https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/10"]
datalist_races = [] #empty list
x = 0 #counter
datalist_results = [] #empty list
x = 0 #counter
for url in urls:
driver = webdriver.Chrome()
driver.get(url)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
htmlStr = driver.page_source
soup_level1 = soup(htmlStr, 'html.parser')
race_soup = soup_level1.find('tbody',{'class':'f_fs13'}).find_parent('table')
results_soup = soup_level1.find('tbody',{'class':'f_fs12'}).find_parent('table')
df_races = pd.read_html(str(race_soup))[0]
datalist_races.loc[len(datalist_races.index)] = df_races.loc[0]
df_results = pd.read_html(str(results_soup))[0]
datalist_results.loc[len(datalist_results.index)] = df_results.loc[0]
print(df_results)
driver.close()

Python selenium web scraped data to csv export

So i am working on a custom web scraper for any kind of ecommerce site, i want it to scrape names and prices of listings on a site and then export them to csv, but the problem is it exports only one line of (name, price) and it prints it on every line of csv, i couldnt find a good solution for this, i hope im not asking an extremely stupid thing, although i think the fix is easy. I hope someone will read my code and help me, thank you !
###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import pandas as pd
#driver path
driver = webdriver.Firefox(executable_path="D:\Programy\geckoDriver\geckodriver.exe")
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
for i in range(x):
try:
main = WebDriverWait(driver, 7).until(
EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]"))
)
##find listings in table
inzeraty = main.find_elements_by_class_name("vypis")
for vypis in inzeraty:
nadpis = vypis.find_element_by_class_name("nadpis")
##print listings to check correctness
nadpist = nadpis.text
print(nadpist)
##find the price and print
for vypis in inzeraty:
cena = vypis.find_element_by_class_name("cena")
cenat = cena.text
print(cenat)
##export to csv - not working
time.sleep(1)
print("Writing to csv")
d = {"Nazov": [nadpist]*20*x,"Cena": [cenat]*20*x}
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv")
time.sleep(1)
print("Writing to csv done !")
##next page
dalsia = driver.find_element_by_link_text("Ďalšia")
dalsia.click()
except:
driver.quit()
i want the csv to look like:
name,price
name2, price2
it would be great is the csv had only two columns and x rows depending on the number of listings
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
#driver path
driver = webdriver.Chrome()
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
time.sleep(10)
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
d = []
for i in range(x):
try:
main = WebDriverWait(driver, 7).until(
EC.presence_of_element_located(
(By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))
##find listings in table
inzeraty = main.find_elements_by_class_name("vypis")
for vypis in inzeraty:
d.append({"Nazov": vypis.find_element_by_class_name("nadpis").text,
"Cena": vypis.find_element_by_class_name("cena").text
})
##next page
dalsia = driver.find_element_by_link_text("Ďalšia")
dalsia.click()
except:
driver.quit()
time.sleep(1)
print("Writing to csv")
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv",index=False)
this gives me 59 items with price. first added to dict then to list, then send that to pandas.
All you need to do is create two empty lists nadpist_l, cenat_l and append data to that lists, finally save the lists as a dataframe.
UPDATED as per the comment
Check if this works
###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
#driver path
driver = webdriver.Chrome()
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
time.sleep(10)
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
d = {}
for i in range(x):
try:
main = WebDriverWait(driver, 7).until(
EC.presence_of_element_located(
(By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))
##find listings in table
inzeraty = main.find_elements_by_class_name("vypis")
nadpist_l = []
for vypis in inzeraty:
nadpis = vypis.find_element_by_class_name("nadpis")
##print listings to check correctness
nadpist = nadpis.text
nadpist_l.append(nadpist)
# print(nadpist)
##find the price and print
cenat_l = []
for vypis in inzeraty:
cena = vypis.find_element_by_class_name("cena")
cenat = cena.text
cenat_l.append(cenat)
print(len(cenat_l))
##export to csv - not working
d.update({"Nazov": [nadpist_l] * 20 * x, "Cena": [cenat_l] * 20 * x})
##next page
dalsia = driver.find_element_by_link_text("Ďalšia")
dalsia.click()
except:
driver.quit()
time.sleep(1)
print("Writing to csv")
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv")
time.sleep(1)
print("Writing to csv done !")

How to extract all dynamic table data from a website using selenium?

I am new to web scraping. I am trying to extract table data from the Forbes Top Multinational Performers list. I was able to successfully extract some data. However, I only was able to get the top 10 from the list. The table contains ads in between. How can I get all the data?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
driver = webdriver.Chrome(r'C:/Users/Shirly.Ang3/Desktop/BUSINESS STAT/GGR/chromedriver_win32/chromedriver.exe')
url = "https://www.forbes.com/top-multinational-performers/list/"
driver.get(url)
wait_row = WebDriverWait(driver, 30)
rows = wait_row.until(EC.presence_of_all_elements_located((By.XPATH,
'.//*[#id="the_list"]/tbody[#id="list-table-body"]')))
data = []
for row in rows:
for i in row.find_elements_by_class_name("data"):
try:
if i.is_displayed():
row_dict = {}
row_dict['Rank'] = i.find_element_by_xpath('.//td[2]').text
row_dict['Link'] = i.find_element_by_xpath('.//td[3]/a[#href]').get_attribute("href")
row_dict['Company'] = i.find_element_by_xpath('.//td[3]').text
row_dict['Industry'] = i.find_element_by_xpath('.//td[4]').text
row_dict['Country'] = i.find_element_by_xpath('.//td[5]').text
data.append(row_dict)
except:
continue
driver.close()
df = pd.DataFrame(data)
df.to_csv("Forbes_TEST.csv", sep=",", index=False)
To get all 250 records you just need to add code to scroll to the bottom of the page to your existing code. So add:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
before:
data = []
and add import time
But saying that your code is really slow. Even with setting your wait_row to 3 it took 1m5.933s to run on my machine. The following code took 0m12.978s to run.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import csv
driver = webdriver.Chrome(r'C:/Users/Shirly.Ang3/Desktop/BUSINESS STAT/GGR/chromedriver_win32/chromedriver.exe')
url = "https://www.forbes.com/top-multinational-performers/list/"
driver.get(url)
wait_row = WebDriverWait(driver, 3)
rows = wait_row.until(EC.presence_of_all_elements_located((By.XPATH, './/*[#id="the_list"]/tbody[#id="list-table-body"]')))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
ranks = []
links = []
companies = []
industries = []
countries = []
soup = BeautifulSoup(driver.page_source, "lxml")
table = soup.find("table", {"id": "the_list"})
for tr in table.find_all("tr", {"class": "data"}):
tds = tr.find_all("td")
ranks.append(tds[1].text)
links.append(tds[2].find('a')['href'])
companies.append(tds[2].text)
industries.append(tds[3].text)
countries.append(tds[4].text)
data = zip(ranks, links, companies, industries, countries)
with open('Forbes_TEST_02.csv', 'w') as csvfile:
csv_out = csv.writer(csvfile)
csv_out.writerow(['Rank', 'Link', 'Company','Industry', 'Country'])
csv_out.writerows(data)
driver.close()

Unable to retrieve data from Macro Trends using selenium and read_html to create a data frame?

I'm want to import data from macro trends into pandas data frame. From looking at the page source of the website it appears that data is in a jqxgrid.
I have tried using pandas/beautiful soup with the read_html function and no table was found. I am currently trying to use selenium to extract the data. I was hoping that if I could move the horizontal scroll bar table jqxgrid would be loaded and able to be extracted. However, that did not work.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import time
driver = webdriver.Chrome()
driver.maximize_window()
driver.execute_script("window.location = 'http://www.macrotrends.net/stocks/charts/AMZN/amazon/income-statement?freq=Q';")
driver.implicitly_wait(2)
grid = driver.find_element_by_id('jqxgrid')
time.sleep(1)
driver.execute_script("window.scrollBy(0, 600);")
scrollbar = driver.find_element_by_id('jqxScrollThumbhorizontalScrollBarjqxgrid')
time.sleep(1)
actions = ActionChains(driver)
time.sleep(1)
for i in range(1,6):
actions.drag_and_drop_by_offset(scrollbar,i*70,0).perform()
time.sleep(1)
pd.read_html(grid.get_attribute('outerHTML'))
Error I get is:
ValueError: No tables found
I would expect table data from "http://www.macrotrends.net/stocks/charts/AMZN/amazon/income-statement?freq=Q" to be imported into a data frame
Here's an alternative that's quicker than selenium which has the headers as shown on page.
import requests
from bs4 import BeautifulSoup as bs
import re
import json
import pandas as pd
r = requests.get('https://www.macrotrends.net/stocks/charts/AMZN/amazon/income-statement?freq=Q')
p = re.compile(r' var originalData = (.*?);\r\n\r\n\r',re.DOTALL)
data = json.loads(p.findall(r.text)[0])
headers = list(data[0].keys())
headers.remove('popup_icon')
result = []
for row in data:
soup = bs(row['field_name'])
field_name = soup.select_one('a, span').text
fields = list(row.values())[2:]
fields.insert(0, field_name)
result.append(fields)
pd.option_context('display.max_rows', None, 'display.max_columns', None)
df = pd.DataFrame(result, columns = headers)
print(df.head())
The problem is the data is not in a table but 'div' elements. I'm not an expert on pandas but you can do it with BeautifulSoup.
Insert the line after your outher imports
from bs4 import BeautifulSoup
then change your last line for:
soup = BeautifulSoup(grid.get_attribute('outerHTML'), "html.parser")
divList = soup.findAll('div', {'role': 'row'})
data = [[x.text for x in div.findChildren('div', recursive=False)] for div in divList]
df = pd.DataFrame(data)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
print(df)
This finds all the 'div' elements with the attribute 'row'. Then reads the text elements for each div it finds under the 'div' elements with the attribute 'row' but only descends one level as some have multiple 'div' elements.
Output:
0 1 2 3 4 5 \
0 Revenue $4,135 $5,672 $3,262 $2,886
1 Cost Of Goods Sold $3,179 $4,501 $2,500 $2,185
2 Gross Profit $956 $1,171 $762 $701
3 Research And Development Expenses $234 $222 $209 $201
4 SG&A Expenses $518 $675 $427 $381
5 Other Operating Income Or Expenses $-6 $-3 $-3 $-3
...
6 7 8 9 10 11 12 13 14
0 $3,015 $3,986 $2,307 $2,139 $2,279 $2,977 $1,858 $1,753 $1,902
1 $2,296 $3,135 $1,758 $1,630 $1,732 $2,309 $1,395 $1,303 $1,444
2 $719 $851 $549 $509 $547 $668 $463 $450 $458
3 $186 $177 $172 $167 $146 $132 $121 $106 $92
4 $388 $476 $335 $292 $292 $367 $247 $238 $257
5 - $-2 $-2 $-3 $-3 $-4 $-40 $-2 $-1
...
However as you scroll across the page the items on the left are removed from the page source so that not all the data is scraped.
Updated in response to comment.
To set the column headers use:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import time
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
driver.maximize_window()
driver.execute_script(
"window.location = 'http://www.macrotrends.net/stocks/charts/AMZN/amazon/income-statement?freq=Q';")
driver.implicitly_wait(2)
grid = driver.find_element_by_id('wrapperjqxgrid')
time.sleep(1)
driver.execute_script("window.scrollBy(0, 600);")
scrollbar = driver.find_element_by_id('jqxScrollThumbhorizontalScrollBarjqxgrid')
time.sleep(1)
actions = ActionChains(driver)
time.sleep(1)
for i in range(1, 6):
actions.drag_and_drop_by_offset(scrollbar, i * 70, 0).perform()
time.sleep(1)
soup = BeautifulSoup(grid.get_attribute('outerHTML'), "html.parser")
headersList = soup.findAll('div', {'role': 'columnheader'})
col_names=[h.text for h in headersList]
divList = soup.findAll('div', {'role': 'row'})
data = [[x.text for x in div.findChildren('div', recursive=False)] for div in divList]
df = pd.DataFrame(data, columns=col_names)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
print(df)
Outputs:
Quarterly Data | Millions of US $ except per share data 2008-03-31 \
0 Revenue $4,135
1 Cost Of Goods Sold $3,179
2 Gross Profit $956
...
2007-12-31 2007-09-30 2007-06-30 2007-03-31 2006-12-31 2006-09-30 \
0 $5,672 $3,262 $2,886 $3,015 $3,986 $2,307
1 $4,501 $2,500 $2,185 $2,296 $3,135 $1,758
...
2006-06-30 2006-03-31 2005-12-31 2005-09-30 2005-06-30 2005-03-31
0 $2,139 $2,279 $2,977 $1,858 $1,753 $1,902
1 $1,630 $1,732 $2,309 $1,395 $1,303 $1,444

Categories

Resources