Appending table values from a webpage to csv - python

I want to get the table from a webpage:
import os
from webdriver_manager.chrome import ChromeDriverManager
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--start-maximized')
options.page_load_strategy = 'eager'
options.add_argument("--headless");
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 20)
driver.get("https://munafasutra.com/nse/dividends")
file_object = open('divident.csv', 'a')
How do I get the first table and its values?

You have to look at the HTML and locate the WebElement that contains that first table (right-clicking the element and choosing "Inspect" in the browser will do the job).
You can save that webelement using the following line of code:
first_table = driver.find_element_by_xpath("//div[#id = 'co']//table[1]") # The [1] is not really necessary as when using **find_element_by_xpath** will only look for the first element.
Then, if you look at how the data is organized inside that table, you can see that each row is wrapped in a tr element. Therefore, if you wish to write it to a CSV file, I would suggest writing it row by row with the following code:
rows = first_table.find_elements_by_xpath("./tbody/tr")
for row in rows:
    entries_of_the_row = row.find_elements_by_xpath("./td")
    row_to_csv = []
    for entry in entries_of_the_row:
        row_to_csv.append(entry.text)
    file_object.write(f"{row_to_csv[0]}, {row_to_csv[1]}, {row_to_csv[2]}, {row_to_csv[3]}, {row_to_csv[4]}\n")
file_object.close()
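As a side note, if any cell text can itself contain a comma, Python's built-in csv module will handle the quoting for you. A minimal sketch of the same loop (same first_table as above):
import csv

with open('divident.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    for row in first_table.find_elements_by_xpath("./tbody/tr"):
        # csv.writer quotes any cell that contains a comma or quote character
        writer.writerow([td.text for td in row.find_elements_by_xpath("./td")])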

You can use the below XPath to retrieve the first table's values:
//h3[text()=' Earlier dividends announced by companies ']/preceding-sibling::table/descendant::td
Something like this :
driver.get("https://munafasutra.com/nse/dividends")
first_table = driver.find_elements(By.XPATH, "//h3[text()=' Earlier dividends announced by companies ']/preceding-sibling::table/descendant::td")
for first in first_table:
    print(first.text)
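Note that this XPath returns every td of the table as one flat list, so for CSV output you have to chunk it back into rows yourself. A small sketch, assuming the table has 5 columns (check the actual column count on the page):
cells = [td.text for td in first_table]
n_cols = 5  # assumption: adjust to the real number of columns
rows = [cells[i:i + n_cols] for i in range(0, len(cells), n_cols)]
for row in rows:
    print(row)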

You can use BeautifulSoup to get the table data. Selenium is not required if you just want to extract web page data.
You need to import the below packages:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
You can extract the HTML of the page using the below code (the soup variable will contain the parsed HTML of the entire page):
url_munafasutra = "https://munafasutra.com/nse/dividends"
html_munafasutra = urlopen(url_munafasutra)
soup = BeautifulSoup(html_munafasutra, 'html.parser')
Below is the code to extract the HTML of the 1st table (table is the tag name, and the index in [] selects which of the matching tables to extract):
first_table = soup.find_all('table')[0]
You can also add attributes to distinctly identify the table along with tag name.
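For example, if the table carried a distinguishing attribute, something like this would work (the class name here is hypothetical; use whatever the real markup shows):
first_table = soup.find('table', attrs={'class': 'dividend-table'})  # hypothetical class name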
Below is the code to extract all the rows of the selected table:
all_rows = first_table.findAll("tr")
Use the below code to write the data to a CSV file:
with open("C:\\Users\\abhay\\.spyder-py3\\table_extract.csv", "wt+", newline="") as f:
    table_to_csv = csv.writer(f)
    for row in all_rows:
        row_data = []
        for cell in row.findAll(["td", "th"]):
            row_data.append(cell.get_text())
        table_to_csv.writerow(row_data)
Below is the complete code to extract the 1st table's data to CSV:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
url_munafasutra = "https://munafasutra.com/nse/dividends"
html_munafasutra = urlopen(url_munafasutra)
soup = BeautifulSoup(html_munafasutra, 'html.parser')
first_table = soup.find_all('table')[0]
all_rows = first_table.findAll("tr")
with open("C:\\Users\\abhay\\.spyder-py3\\table_extract.csv", "wt+", newline="") as f:
table_to_csv = csv.writer(f)
for row in all_rows:
row_data = []
for cell in row.findAll(["td", "th"]):
row_data.append(cell.get_text())
table_to_csv.writerow(row_data)

Related

How can I find the right XPath and loop over the table?

I would like to get all the values from the table "Elektriciteit NL" on https://powerhouse.net/forecast-prijzen-onbalans/. However, after endlessly trying to find the right XPath using Selenium, I was not able to scrape the table.
I tried to use "Inspect" and copy the XPath of the table to identify the length of the table for scraping later. After this failed I tried to use contains(), but this was not successful either. Afterwards I tried some things using BeautifulSoup, also without any luck.
#%%
import pandas as pd
from selenium import webdriver
#%% powerhouse Elektriciteit NL base & peak
url = "https://powerhouse.net/forecast-prijzen-onbalans/"
#%% open webpagina
driver = webdriver.Chrome(executable_path=path + 'chromedriver.exe')
driver.get(url)
#%%
prices = []
# loop for values in table
for j in range(len(driver.find_elements_by_xpath('//tr[@id="endex_nl_forecast"]/div[3]/table/tbody/tr[1]/td[4]'))):
    base = driver.find_elements_by_xpath('//tr[@id="endex_nl_forecast"]/div[3]/table/tbody/tr[1]/td[4]')[j]
#%%
#trying with BeautifulSoup
from bs4 import BeautifulSoup
import requests
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
table = soup.find('table', id = 'endex_nl_forecast')
rows = soup.find_all('tr')
I would like to have the table in a DataFrame and understand how XPath exactly works. I'm kind of new to the whole concept.
If you are open to ways other than XPath, you could do this without Selenium at all: just use pandas.
import pandas as pd
table = pd.read_html('https://powerhouse.net/forecast-prijzen-onbalans/')[4]
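Note that the hard-coded [4] index is fragile if the page layout changes. As a hedge you can let pandas filter on text you expect in the table via read_html's match parameter; a sketch ('Base' is an assumed header word from the base & peak table, adjust to the real page):
import pandas as pd

# read_html only returns tables whose text matches; 'Base' is an assumption
tables = pd.read_html('https://powerhouse.net/forecast-prijzen-onbalans/', match='Base')
table = tables[0]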
If you want a text representation of the icons, you can extract from the appropriate tds the class name of the svg, which encodes the arrow direction:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
r = requests.get('https://powerhouse.net/forecast-prijzen-onbalans/')
soup = bs(r.content, 'lxml')
table = soup.select_one('#endex_nl_forecast table')
rows = []
headers = [i.text for i in table.select('th')]
for tr in table.select('tr')[1:]:
    rows.append([i.text if i.svg is None else i.svg['class'][2].split('-')[-1] for i in tr.select('td')])
df = pd.DataFrame(rows, columns = headers)
print(df)
You can use the Selenium driver to locate the table and its contents:
import time

url = 'https://powerhouse.net/forecast-prijzen-onbalans/'
driver.get(url)
time.sleep(3)
To read the table headers and print them:
tableHeader = driver.find_elements_by_xpath("//*[@id='endex_nl_forecast']//thead//th")
print(tableHeader)
for header in tableHeader:
    print(header.text)
To find the number of rows in the table:
rowElements = driver.find_elements_by_xpath("//*[@id='endex_nl_forecast']//tbody/tr")
print('Total rows in the table:', len(rowElements))
To print each row as-is:
for row in rowElements:
    print(row.text)
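row.text collapses each row into one whitespace-joined string; if you want the cells kept separate (e.g. for a CSV), you can read the td elements per row instead. A minimal sketch building on the locators above:
data = []
for row in rowElements:
    # one list entry per cell instead of one joined string
    data.append([td.text for td in row.find_elements_by_xpath(".//td")])
print(data)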

How to get the columns of a table, plus the data behind the link in the first column, by clicking that link

I have the below link:
http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Pune
From this I want to scrape the data into Excel in a proper format. Each SurveyNo link reveals extra data when clicked, and I want the row-wise table data together with the data shown on clicking the survey number.
I also want the format that I have attached in the image (desired output in Excel).
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Pune'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.implicitly_wait(10)
driver.get(url)
Select(driver.find_element_by_name('ctl00$ContentPlaceHolder5$ddlTaluka')).select_by_value('5')
Select(driver.find_element_by_name('ctl00$ContentPlaceHolder5$ddlVillage')).select_by_value('1872')
soup = BeautifulSoup(driver.page_source, 'lxml')
table = soup.find("table", attrs={'id': 'ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate'})
with open('Baner.csv', 'w', encoding='utf-16', newline='') as csvfile:
    f = csv.writer(csvfile, dialect='excel')
    f.writerow(['SurveyNo', 'Subdivision', 'Open ground', 'Resident house', 'Offices', 'Shops', 'Industrial', 'Unit (Rs./)'])  # headers
    rows = table.find_all('tr')[1:]
    data = []
    for tr in rows:
        cols = tr.find_all('td')
        for td in cols:
            links = driver.find_elements_by_link_text('SurveyNo')
            l = len(links)
            data12 = []
            for i in range(l):
                newlinks = driver.find_elements_by_link_text('SurveyNo')
                newlinks[i].click()
                soup = BeautifulSoup(driver.page_source, 'lxml')
                td1 = soup.find("textarea", attrs={'class': 'textbox'})
                data12.append(td1.text)
            data.append(td.text)
            data.append(data12)
print(data)
Please see the attached image; that is the format in which I need the scraped data output.
You could do the following and simply re-arrange the columns at the end, along with the desired renaming. There is the assumption that SurveyNo exists for all wanted rows. I extract the hrefs from the SurveyNo cells, which are actually executable javascript strings you can pass to execute_script to show the survey numbers without worrying about stale elements etc.
from selenium import webdriver
import pandas as pd
url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Pune'
d = webdriver.Chrome()
d.get(url)
d.find_element_by_css_selector('[value="5"]').click()
d.find_element_by_css_selector('[value="1872"]').click()
tableElement = d.find_element_by_id('ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
table = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
table.columns = table.iloc[0]
table = table.iloc[1:]
table = table[table.Select == 'SurveyNo'] #assumption SurveyNo exists for all wanted rows
surveyNo_scripts = [item.get_attribute('href') for item in d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
i = 0
for script in surveyNo_scripts:
    d.execute_script(script)
    surveys = d.find_element_by_css_selector('textarea').text
    table.iloc[i]['Select'] = surveys
    i += 1
print(table)
#rename and re-order columns as required
table.to_csv(r"C:\Users\User\Desktop\Data.csv", sep=',', encoding='utf-8-sig',index = False )
If you are looping over multiple pages, you can concat all the DataFrames and then write them out in one go (my preference, sketched below), or append to the output file as you go.
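A minimal sketch of that concat-then-write pattern, reusing the table locator above ('urls' is a hypothetical list of pages to scrape):
import pandas as pd

dfs = []
for url in urls:  # hypothetical list of pages
    d.get(url)
    tableElement = d.find_element_by_id('ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
    dfs.append(pd.read_html(tableElement.get_attribute('outerHTML'))[0])
# one concat, one write, instead of appending inside the loop
pd.concat(dfs, ignore_index=True).to_csv(r"C:\Users\User\Desktop\Data.csv", index=False)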

Cannot extract the html table

I want to harvest information from a table within a given website using BeautifulSoup and Python 3.
I have also tried the XPath approach but still cannot find a way to obtain the data.
from urllib.request import urlopen
from bs4 import BeautifulSoup

coaches = 'https://www.badmintonengland.co.uk/coach/find-a-coach'
coachespage = urlopen(coaches)
soup = BeautifulSoup(coachespage, features="html.parser")
data = soup.find_all("tbody", {"id": "JGrid-az-com-1031-tbody"})

def crawler(table):
    for mytable in table:
        try:
            rows = mytable.find_all('tr')
            for tr in rows:
                cols = tr.find_all('td')
                for td in cols:
                    return(td.text)
        except:
            raise ValueError("no data")

print(crawler(data))
If you use Selenium to make the selections and then pd.read_html the page_source to get the table, the javascript is allowed to run and populate the values:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
url = 'https://www.badmintonengland.co.uk/coach/find-a-coach'
driver = webdriver.Chrome()
driver.get(url)
ele = driver.find_element_by_css_selector('.az-triggers-panel a') #distance dropdown
driver.execute_script("arguments[0].scrollIntoView();", ele)
ele.click()
option = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.ID, "comboOption-az-com-1015-8"))) # any distance
option.click()
driver.find_element_by_css_selector('.az-btn-text').click()
time.sleep(5) #seek better wait condition for page update
tables = pd.read_html(driver.page_source)
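read_html returns one DataFrame per <table> in the source, so you still need to pick out the coach table; for example:
df = tables[0]  # assumed index; inspect len(tables) and each frame's columns to find the right one
print(df.head())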

How to retrieve table from dynamic website python selenium

I want to retrieve all the information from a table on a dynamic website and I have the following code for it:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import sys
reload(sys)
import re
import csv
from time import sleep
sys.setdefaultencoding('utf-8') #added since it would give error for certain values when using str(i)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
prefs = {'profile.managed_default_content_settings.images':2}
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(chrome_options=chrome_options)
maxcr = 1379
listofrows = []
url = "http://biggestbook.com/ui/catalog.html#/itemDetail?itemId=HERY4832YER01&uom=CT"
print(url)
driver.get(url)
wait = WebDriverWait(driver,10)
# Trying to get the table
tableloadwait = (wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".panel-body"))))
table = driver.find_elements_by_css_selector(".panel-body")
print(table)
RowsOfTable = table.get_attribute("tr")
However, I keep getting the error below and haven't been able to make it work so far. How do I retrieve the information from the table?
Thanks a lot!
error:
RowsOfTable = table.get_attribute("tr")
AttributeError: 'list' object has no attribute 'get_attribute'
Here is the code to get the product details:
tableloadwait = (wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".panel-body"))))
driver.find_element_by_xpath("//span[contains(.,'Product Details')]").click()
rows = driver.find_elements_by_xpath("//span[contains(.,'Product Details')]/ancestor::div[#class='accordion-top-border']//tr[(#ng-repeat='attr in attributes' or #ng-repeat='field in fields') and #class='visible-xs']")
for rowNum in range(len(rows)):
print(rows[rowNum].get_attribute('innerText'))
driver.quit()
We have to trim or split the values as per your requirement.
If you would like to get the data based on the row's label text, use the below:
upcData = driver.find_element_by_xpath("//strong[.='UPC']/parent::td").get_attribute('innerText').replace('UPC','').replace('\n','').replace(' ','')
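If you need several such labelled fields, the same locator generalizes into a small helper; a sketch (any field name other than UPC is an assumption about the page):
def field_by_label(label):
    # finds the cell whose <strong> label matches, then strips the label off
    cell = driver.find_element_by_xpath(f"//strong[.='{label}']/parent::td")
    return cell.get_attribute('innerText').replace(label, '').strip()

upc = field_by_label('UPC')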
Expand the accordion with the appropriate + button first then select the table. Add waits for items to be present. Change the expandSigns index to 2 if you want the other table.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'http://biggestbook.com/ui/catalog.html#/itemDetail?itemId=HERY4832YER01&uom=CT'
driver = webdriver.Chrome()
driver.get(url)
expandSigns = WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".glyphicon-plus")))
expandSigns[1].click()
WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "td")))
table = driver.find_element_by_css_selector('table')
html = table.get_attribute('outerHTML')
df = pd.read_html(html)[0]  # read_html returns a list of DataFrames
print(df)
driver.quit()
If you need to scrape, not test, you can use requests to get the data. The below code is an example of how you can get the data from the page.
import requests
import re
# Return header page(html) to get token and list key
response = requests.get("http://biggestbook.com/ui/catalog.html#/itemDetail?itemId=HERY4832YER01&uom=CT")
# Get token using regular expression
productRecommToken = re.search("'productRecommToken','(.+)'", response.text)[1]
# Get list of keys using regular expression
listKey = re.search("'listKey',\\['(.*?)'\\]", response.text)[1].split("','")
# Create header with token
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Referer': 'http://biggestbook.com/ui/catalog.html',
    'Origin': 'http://biggestbook.com',
    'DNT': '1',
    'token': productRecommToken,
    'BiggestBook-Handle-Errors-Generically': 'true',
}
# Create parameters with list keys and search values
params = (
    ('listKey', listKey),
    ('uom', 'CT'),
    ('vc', 'n'),
    ('win', 'HERY4832YER01'),
)
# Return json with all details about product
response = requests.get('https://api.essendant.com/digital/digitalservices/search/v1/items',
                        headers=headers,
                        params=params)
data = response.json()
# Get items from json, probably could be more than one
items = data["items"]
# Iterate and get details you need. Check "data" to see all possible details you can get
for i in items:
    print(i["manufacturer"])
    print(i["description"])
    print(i["actualPrice"])
    # Get attributes
    attributes = i["attributes"]
    # Example how you can get one specific attribute
    thickness = list(filter(lambda d: d['name'] == 'Thickness', attributes))[0]["value"]
    # Print all attributes as name = value
    for a in attributes:
        print(f"{a['name']} = {a['value']}")

How to append a new string element in python?

I am web-scraping 2 tables from 2 different sites.
I want to append a new column (called WHEREFROM in the header) containing a scraped text; in my code I called it "name".
My code is here:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import urllib2
import unicodecsv as csv
import os
import sys
import io
import datetime
import pandas as pd
import re
import contextlib
import selenium.webdriver.support.ui as ui

filename = r'output.csv'
resultcsv = open(filename, "wb")
output = csv.writer(resultcsv, delimiter=';', quotechar='"', quoting=csv.QUOTE_NONNUMERIC, encoding='latin-1')
output.writerow(['TIME', 'FLIGHT', 'FROM', 'AIRLANE', 'AIRCRAFT', 'STATUS', 'WHEREFROM', 'ACTUALDATE'])

def scrape(urls):
    browser = webdriver.Firefox()
    for url in urls:
        browser.get(url)
        html = browser.page_source
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find('table', {"class": "table table-condensed table-hover data-table m-n-t-15"})
        soup2 = BeautifulSoup(html, "html.parser")
        name = soup2.find('div', attrs={'class': 'row m-t-l m-l-l'})
        datatable = []
        for record in table.find_all('tr', class_="hidden-xs hidden-sm ng-scope"):
            temp_data = []
            for data in record.find_all("td"):
                temp_data.append(data.text.encode('latin-1'))
            newlist = filter(None, temp_data)
            datatable.append(newlist)
        print name
        output.writerows(datatable)
    resultcsv.close()
    time.sleep(10)
    browser.close()

urls = ["https://www.flightradar24.com/data/airports/bud/arrivals", "https://www.flightradar24.com/data/airports/fco/arrivals"]
scrape(urls)
resultcsv.close()
How can I do this in a loop, and how can I do it correctly? After scraping I write these data to a CSV where the delimiter is ;.
But after web-scraping the tables there isn't any ; after the last text, so I think I have to insert a ; after this last text too?!
I am talking about this:
"1:15 PM";" KL1975";"Amsterdam (AMS)-";"KLM";"B737 (PH-BGT) ";"Landed 1:01 PM"
EDITED with the actual date (not working, format issue):
df = pd.DataFrame(newlist)
now = time.strftime('%d-%m-%Y')
df['ACTUALDATE'] = now
#df.rows = header
df.to_csv('output.csv', sep=';', encoding='latin-1', index=False)
I wrote this inside the loop to record the actual date (I want hours and minutes too, but this only gives the day).
It seems so trivial that I'm not even sure I really understood the question... If what you want is to add name as the last element of each row in your csv, all you have to do is, well, to add it as the last element of the rows you're passing to your csv writer:
for record in table.find_all('tr', class_="hidden-xs hidden-sm ng-scope"):
    temp_data = []
    for data in record.find_all("td"):
        temp_data.append(data.text.encode('latin-1'))
    # here
    temp_data.append(name)
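One caveat: name as found above is a bs4 Tag, so you probably want its text rather than the object itself, and you can fill the ACTUALDATE column the same way. A sketch of those two appends (the strftime format is just an example):
temp_data.append(name.get_text(strip=True))  # plain text, not the Tag object
temp_data.append(time.strftime('%d-%m-%Y %H:%M'))  # ACTUALDATE with hours and minutes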
