I want to retrieve all the information from a table on a dynamic website and I have the following code for it:
# Scrape the product-detail table rows from a dynamically rendered page.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import sys
import re
import csv
from time import sleep

# NOTE: the original called reload(sys) / sys.setdefaultencoding('utf-8'),
# which exists only in Python 2; Python 3 strings are Unicode already, so
# str(i) needs no encoding workaround.

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# Disable image loading to speed up page rendering.
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option("prefs", prefs)
# Selenium 4 removed the chrome_options= keyword; use options= instead.
driver = webdriver.Chrome(options=chrome_options)

maxcr = 1379  # NOTE(review): unused here; presumably used further down the full script
listofrows = []
url = "http://biggestbook.com/ui/catalog.html#/itemDetail?itemId=HERY4832YER01&uom=CT"
print(url)
driver.get(url)
wait = WebDriverWait(driver, 10)

# Wait until the panels are present before touching the DOM.
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".panel-body")))

# find_elements_* returns a LIST of WebElements, so get_attribute() cannot be
# called on the result directly (that was the AttributeError). Locate the
# <tr> rows inside the panels and iterate over them instead.
rows = driver.find_elements_by_css_selector(".panel-body tr")
for row in rows:
    listofrows.append(row.get_attribute("innerText"))
print(listofrows)
However, I keep getting an error and it doesn't work so far. How do I retrieve the information from the table?
Thanks a lot!
error:
RowsOfTable = table.get_attribute("tr")
AttributeError: 'list' object has no attribute 'get_attribute'
Here is the code to get the product details
# Wait for the panels, expand the "Product Details" accordion, then print
# every attribute row of the details table.
tableloadwait = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".panel-body")))
driver.find_element_by_xpath("//span[contains(.,'Product Details')]").click()
# XPath attribute tests use '@', not '#' — the '#' in the original was
# markup garbling and makes the expression invalid.
rows = driver.find_elements_by_xpath("//span[contains(.,'Product Details')]/ancestor::div[@class='accordion-top-border']//tr[(@ng-repeat='attr in attributes' or @ng-repeat='field in fields') and @class='visible-xs']")
# Iterate the elements directly instead of indexing by range(len(...)).
for row in rows:
    print(row.get_attribute('innerText'))
driver.quit()
You may have to trim or split the values according to your requirements.
If you would like to get the data based on the row text, use the below.
upcData = driver.find_element_by_xpath("//strong[.='UPC']/parent::td").get_attribute('innerText').replace('UPC','').replace('\n','').replace(' ','')
Expand the accordion with the appropriate + button first then select the table. Add waits for items to be present. Change the expandSigns index to 2 if you want the other table.
# Expand the accordion, wait for the cells, then hand the table's HTML to
# pandas for parsing.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from io import StringIO

url = 'http://biggestbook.com/ui/catalog.html#/itemDetail?itemId=HERY4832YER01&uom=CT'
driver = webdriver.Chrome()
driver.get(url)
# Wait for the "+" expanders; change the index to 2 for the other table.
expandSigns = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".glyphicon-plus")))
expandSigns[1].click()
# Wait until table cells have rendered before grabbing the markup.
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "td")))
table = driver.find_element_by_css_selector('table')
html = table.get_attribute('outerHTML')
# pandas deprecated passing literal HTML to read_html; wrap it in StringIO.
df = pd.read_html(StringIO(html))
print(df)
driver.quit()
If you need to scrape, not test, you can use requests to get the data. The code below is an example of how you can get data from the page.
import requests
import re

# Fetch the catalog page (HTML) to extract the API token and list keys.
response = requests.get("http://biggestbook.com/ui/catalog.html#/itemDetail?itemId=HERY4832YER01&uom=CT")

# Extract the token with a regular expression; fail loudly (instead of a
# cryptic TypeError on None[1]) if the page layout changed.
token_match = re.search(r"'productRecommToken','(.+)'", response.text)
if token_match is None:
    raise ValueError("productRecommToken not found - page layout may have changed")
productRecommToken = token_match[1]

# Extract the list of keys the same way.
key_match = re.search(r"'listKey',\['(.*?)'\]", response.text)
if key_match is None:
    raise ValueError("listKey not found - page layout may have changed")
listKey = key_match[1].split("','")

# Build the header with the token so the API accepts the request.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Referer': 'http://biggestbook.com/ui/catalog.html',
    'Origin': 'http://biggestbook.com',
    'DNT': '1',
    'token': productRecommToken,
    'BiggestBook-Handle-Errors-Generically': 'true',
}
# Query parameters: list keys plus the search values.
params = (
    ('listKey', listKey),
    ('uom', 'CT'),
    ('vc', 'n'),
    ('win', 'HERY4832YER01'),
)
# Returns JSON with all details about the product.
response = requests.get('https://api.essendant.com/digital/digitalservices/search/v1/items',
                        headers=headers,
                        params=params)
data = response.json()
# Get the items from the JSON; there could be more than one.
items = data["items"]
# Iterate and print the details needed; inspect "data" for everything available.
for i in items:
    print(i["manufacturer"])
    print(i["description"])
    print(i["actualPrice"])
    # Product attributes as a list of {name, value} dicts.
    attributes = i["attributes"]
    # Example: fetch one specific attribute without crashing when it is
    # absent (the original indexed [0] and would raise IndexError).
    thickness = next((d["value"] for d in attributes if d["name"] == 'Thickness'), None)
    # Print all attributes as name = value.
    for a in attributes:
        print(f"{a['name']} = {a['value']}")
Related
I did a project where I looped over every single student ID in my college to get each individual student's results, so I could build an analytical dashboard for each student and email them their results along with a nice report. I scraped the website our college uploads our results to.
the code for it was this:
#Importing The Neccessary modules
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
#Reading the data
# Excel sheet expected to hold one student ID per row in column 'IDS'.
our_ids = pd.read_excel("All Our IDs.xlsx")
total_students = our_ids.shape[0]  # number of students to loop over
df_to_hold_all_data = pd.DataFrame()  # accumulates every student's result rows
#Defining Functions to use in the script
def make_request(student_id):
    """
    Request the results page for *student_id* and return the response.

    Keeps retrying until the server answers with HTTP 200, so the caller
    always receives a successful response.
    """
    url = 'http://app1.helwan.edu.eg/Commerce/HasasnUpMlist.asp'  # Base URL to our college website
    # Form parameters of the results search; only the student number varies.
    params = {
        'z_dep': '=',
        'z_st_name': 'LIKE',
        'z_st_settingno': '=',
        'x_st_settingno': f'{student_id}',
        'x_st_name': '',
        'z_gro': '=',
        'x_gro': '',
        'x_dep': '',
        'z_sec': 'LIKE',
        'x_sec': '',
        'Submit': '++++حفظ++++'
    }
    response_state = 0
    while response_state != 200:
        try:
            response = requests.get(url, params=params, timeout=10)
        except (requests.exceptions.Timeout,
                requests.exceptions.ConnectionError):
            # Timeout covers both ReadTimeout and ConnectTimeout; transient
            # connection drops are retried too instead of crashing the run.
            print("Requesting Again...")
            continue
        response_state = response.status_code
    return response
def make_the_second_request_with_selenium(link):
    """
    Open *link* in a headless Edge browser and return the driver once the
    student-name cell is visible.

    Retries until the page renders. The original retried via recursion but
    dropped the recursive return value (so callers got None) and leaked one
    browser session per failed attempt; this loop fixes both.
    """
    timeout = 10  # seconds to wait for the results table to render
    while True:
        # Create a fresh headless Edge driver per attempt.
        options = Options()
        options.add_argument('--headless')
        driver = webdriver.Edge(options=options)
        try:
            driver.get(link)
            WebDriverWait(driver, timeout).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '/html/body/form/div/table[1]/tbody/tr[3]/td[2]/div/font/b')))
            return driver
        except (TimeoutException, NoSuchElementException):
            driver.quit()  # do not leak the failed browser session
            print("Requesting Again...")
this_loop = 0
#Looping for all students
for student_id in our_ids['IDS'].unique():
    print(f"\nNow Looping for {student_id}\n")
    response = make_request(student_id)  # retries internally until HTTP 200
    print(f"{response.status_code}")
    # Parse the search results and find this student's personal results link.
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', {'href': True})
    link_to_natega = ''
    for link in links:
        if "StdCode" in link['href']:
            # Each student has a unique link to their results page.
            link_to_natega = f"http://app1.helwan.edu.eg/Commerce/{link['href']}"
    print(link_to_natega)
    try:
        driver = make_the_second_request_with_selenium(link_to_natega)
        # Header table: name / ID / department cells.
        name = driver.find_element(By.XPATH, '/html/body/form/div/table[1]/tbody/tr[3]/td[2]/div/font/b').text
        id_of_student = driver.find_element(By.XPATH, '/html/body/form/div/table[1]/tbody/tr[3]/td[4]/div/font/b').text
        department = driver.find_element(By.XPATH, '/html/body/form/div/table[1]/tbody/tr[5]/td[2]/div/font/b').text
        # Rows 3..8 of the second table hold the six subject/score pairs;
        # one loop replaces twelve copy-pasted XPath lookups.
        subjects, scores = [], []
        for tr_idx in range(3, 9):
            subjects.append(driver.find_element(By.XPATH, f'/html/body/form/div/table[2]/tbody/tr[{tr_idx}]/td[2]/div/font/b').text)
            scores.append(driver.find_element(By.XPATH, f'/html/body/form/div/table[2]/tbody/tr[{tr_idx}]/td[4]/div/font/b').text)
        data = {'name': name, 'ID': id_of_student, "Department": department,
                "Subject": subjects, "Score": scores}
        df = pd.DataFrame(data)  # one block of six rows for this student
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to accumulate rows.
        df_to_hold_all_data = pd.concat([df_to_hold_all_data, df], ignore_index=True)
        # Close the driver
        driver.quit()
        print(f"The shape of the data now is: {df_to_hold_all_data.shape}")
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # still stop the run instead of being swallowed.
        print(f'failed to get data for {student_id}')
    this_loop += 1
    remaining_students = total_students - this_loop
    print(f'Done Looping For {student_id} The remaining students: {remaining_students}')
df_to_hold_all_data.to_excel("All Our Results.xlsx", index=False)
I don't know if it's possible to create this with scrapy?
If yes, How much would it make the process faster?
Is it worth investing the time and effort to learn it and rewrite the code again?
edit: Sorry for poor structure, Data Analysis and statistics is the part where I am actually good :D
Your help would be appreciated.
I have a simple question that assumingly can be solved very easily.
However, I have spent quite some time now trying to extract the four lines of information shown here:
see html structure here
I first try to access the <ul _ngcontent-xl-byg-c79="" class="short ng-star-inserted" item to then loop over the <li _ngcontent-xl-byg-c79="" class="table-row ng-star-inserted"> items in order to store the embedded information in my dataframe (columns are 'Mærke', 'Produkttype', 'Serie', and 'Model').
What do I do wrong? My problem is that the four lines have the same "class" name, which gives me the same output in all four loops.
This is my code:
from selenium import webdriver
import pandas as pd

# Activate web browser: External control
browser = webdriver.Chrome(r'C:\Users\KristerJens\Downloads\chromedriver_win32\chromedriver')
# Get webpage
browser.get("https://www.xl-byg.dk/shop/knauf-insulation-ecobatt-murfilt-190-mm-2255993")

# All four <li class="table-row ng-star-inserted"> rows share one class, so
# they cannot be told apart by class name (that is why every loop returned
# the same value). Read each row's attribute-name span and route its value
# to the matching list instead. Note XPath attribute tests use '@', not '#'.
columns = {'Mærke': [], 'Produkttype': [], 'Serie': [], 'Model': []}
for row in browser.find_elements_by_xpath("//ul[@class='short ng-star-inserted']/li"):
    # A leading '.' keeps the search inside the current row element.
    attr_name = row.find_element_by_xpath("./span[@class='attribute-name']").text
    attr_value = row.find_element_by_xpath("./*[contains(@class,'ng-star-inserted')]").text
    if attr_name in columns:
        columns[attr_name].append(attr_value)

df = pd.DataFrame()
df['brand'] = columns['Mærke']
df['product'] = columns['Produkttype']
df['series'] = columns['Serie']
df['model'] = columns['Model']
Any help is very appreciated!!
Try like below and confirm:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(executable_path="path to chromedriver.exe")
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://www.xl-byg.dk/shop/knauf-insulation-ecobatt-murfilt-190-mm-2255993")
wait = WebDriverWait(driver, 30)
# Dismiss the cookie pop-up (the button label depends on the locale).
# XPath attribute tests use '@'; the '#' in the original was markup garbling.
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@aria-label='Accept all' or @aria-label = 'Accepter alle']"))).click()
options = driver.find_elements_by_xpath("//div[@class='row-column']//ul[contains(@class,'short')]/li")
for opt in options:
    # Use a "." in the xpath to find an element within an element.
    attribute = opt.find_element_by_xpath("./span[@class='attribute-name']").text
    value = opt.find_element_by_xpath("./*[contains(@class,'ng-star-inserted')]").text
    print(f"{attribute} : {value}")
Mærke : Knauf Insulation
Produkttype : Murfilt
Serie : ECOBATT
Materiale : Glasmineraluld
I want to get the table from webpage
# Set up a headless Chrome session on the dividends page and open the CSV
# file that the table rows will be appended to.
import os
import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Browser configuration: ignore certificate errors, start maximised, do not
# wait for every sub-resource ('eager'), and run without a visible window.
options = Options()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--start-maximized')
options.add_argument("--headless")
options.page_load_strategy = 'eager'

driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 20)
driver.get("https://munafasutra.com/nse/dividends")

# Append mode: rows are written (and the handle closed) further below.
file_object = open('divident.csv', 'a')
output table
How do I get the first table and its values?
You have to look at the HTML path and locate the WebElement that is gathering that first table (Clicking "Inspect" when you click right-button of the mouse can do the work).
You can save that webelement using the following line of code:
first_table = driver.find_element_by_xpath("//div[#id = 'co']//table[1]") # The [1] is not really necessary as when using **find_element_by_xpath** will only look for the first element.
Then, if you look at how data is organized inside that table, you can observe each row is gathered by a tr WebElement. Therefore, if you wish to write it in a csv file, I would suggest to write row by row with the following code:
# Each <tr> of the table holds its cells in <td> children; write one CSV
# line per row. Joining the cell texts handles any column count, where the
# original hard-coded indices [0]..[4] and raised IndexError on shorter rows.
rows = first_table.find_elements_by_xpath("./tbody/tr")
for row in rows:
    entries_of_the_row = row.find_elements_by_xpath("./td")
    row_to_csv = [entry.text for entry in entries_of_the_row]
    file_object.write(", ".join(row_to_csv) + "\n")
file_object.close()
You can use below XPATH to retrieve the first table value :
//h3[text()=' Earlier dividends announced by companies ']/preceding-sibling::table/descendant::td
Something like this :
# Load the dividends page, then print every cell of the table that sits
# immediately before the "Earlier dividends announced by companies" heading.
driver.get("https://munafasutra.com/nse/dividends")
cells = driver.find_elements(By.XPATH, "//h3[text()=' Earlier dividends announced by companies ']/preceding-sibling::table/descendant::td")
for cell in cells:
    print(cell.text)
You can use BeautifulSoup to get the table data. Selenium is not required if you just want to extract web page data.
You need to import below packages :
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
You can Extract HTML of the table using below code (soup variable will contain the HTML code of the entire page):
# Download the page and parse it; soup will contain the entire document.
url_munafasutra = "https://munafasutra.com/nse/dividends"
html_munafasutra = urlopen(url_munafasutra)
# Name the parser explicitly: plain 'html' lets BeautifulSoup pick whichever
# HTML parser is installed, which makes results environment-dependent.
soup = BeautifulSoup(html_munafasutra, 'html.parser')
Below is the code to extract HTML for the 1st table (here table is the tag value and in [] contains the index of the table we want to extract data of):
first_table = soup.find_all('table')[0]
You can also add attributes to distinctly identify the table along with tag name.
Below is the code to extract all the rows in the selected table :
all_rows = first_table.findAll("tr")
Use the below code to write the data in csv file :
# Write each table row as one CSV line, taking the text of both header
# (<th>) and data (<td>) cells; newline='' is the csv-module convention.
with open("C:\\Users\\abhay\\.spyder-py3\\table_extract.csv", "wt+", newline="") as f:
    table_to_csv = csv.writer(f)
    for row in all_rows:
        cells = [cell.get_text() for cell in row.findAll(["td", "th"])]
        table_to_csv.writerow(cells)
Below is the complete code to extract 1st table data to csv :
# Complete script: download the dividends page, take the first table, and
# dump every row (headers included) into a CSV file.
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

url_munafasutra = "https://munafasutra.com/nse/dividends"
html_munafasutra = urlopen(url_munafasutra)
# Name the parser explicitly ('html' lets bs4 pick whichever parser is
# installed, making the output environment-dependent).
soup = BeautifulSoup(html_munafasutra, 'html.parser')
# The first <table> in document order is the one we want.
first_table = soup.find_all('table')[0]
# Modern bs4 spelling is find_all (findAll is a legacy alias).
all_rows = first_table.find_all("tr")
with open("C:\\Users\\abhay\\.spyder-py3\\table_extract.csv", "wt+", newline="") as f:
    table_to_csv = csv.writer(f)
    for row in all_rows:
        # Both header (<th>) and data (<td>) cells feed the CSV line.
        row_data = [cell.get_text() for cell in row.find_all(["td", "th"])]
        table_to_csv.writerow(row_data)
I want to harvest information using beautiful soup and python3 from a table within a given website .
I have also tried to use XPath method but still cannot get a way to obtain the data.
# Harvest every cell of the find-a-coach results table.
coaches = 'https://www.badmintonengland.co.uk/coach/find-a-coach'
coachespage = urlopen(coaches)
soup = BeautifulSoup(coachespage, features="html.parser")
data = soup.find_all("tbody", {"id": "JGrid-az-com-1031-tbody"})

def crawler(table):
    """Return the text of every <td> cell, row by row, in *table*.

    The original returned inside the innermost loop, so at most the FIRST
    cell was ever harvested; collect all cell texts instead. Raises
    ValueError when no table was found at all.
    """
    if not table:
        raise ValueError("no data")
    cells = []
    for mytable in table:
        for tr in mytable.find_all('tr'):
            for td in tr.find_all('td'):
                cells.append(td.text)
    return cells

print(crawler(data))
If you use selenium to make the selections and then pd.read_html the page_source to get the table, this allows javascript to run and populate values
# Drive the search UI with Selenium (so the JavaScript populates the table),
# then let pandas parse the rendered page source.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

url = 'https://www.badmintonengland.co.uk/coach/find-a-coach'
driver = webdriver.Chrome()
driver.get(url)

# Scroll the distance dropdown into view before clicking it.
distance_trigger = driver.find_element_by_css_selector('.az-triggers-panel a')
driver.execute_script("arguments[0].scrollIntoView();", distance_trigger)
distance_trigger.click()

# Pick the "any distance" option once it is present in the DOM.
any_distance = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "comboOption-az-com-1015-8")))
any_distance.click()
driver.find_element_by_css_selector('.az-btn-text').click()

time.sleep(5)  # seek a better wait condition for the page update
tables = pd.read_html(driver.page_source)
I am a newbie with Python and trying to retrieve data within
this Site using Python version 3.6.0
There are 2 dropdowns and second's data depends on the first's selection.
First: 'Organizasyon Adi'
Second: 'UEVCB Adi'
All options from the source is like:
<option value="0" selected="selected">TÜMÜ</option> #this is default value when we open the page
<option value="10374">1461 TRABZON ELEKTRİK ÜRETİM A.Ş</option>
<option value="9426">2M ELEKTRİK ÜRETİM SANAYİ VE TİCARET ANONİM ŞİRKETİ</option>
These are the options for the first dropdown, and there are almost 800 of them.
We cant see the second Dropdowns options without inspecting the page unless the second Dropdown box is clicked. (Both dropdowns opens a searchbox when clicked.)
Second Dropdown opens a list of units for selected organisation.
When options from two Dropdowns are selected it generates a table data and we're trying to get data for all units.
I couldn't manage to scrape the data for all units with one program, so I decided to scrape them individually.
With this code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains
import time
from bs4 import BeautifulSoup
import pandas as pd
# KGÜP (production planning) page of the EPİAŞ transparency site.
url = 'https://seffaflik.epias.com.tr/transparency/uretim/planlama/kgup.xhtml' #
driver = webdriver.Chrome()
driver.get(url)
time.sleep(3)  # let the page finish its initial load

# Open the organisation drop-down, type into its filter box and confirm
# with ENTER ('\ue007' is Selenium's ENTER key code). XPath attribute
# tests use '@', not '#' — the '#' was markup garbling.
organisation = driver.find_element_by_xpath(".//*[@id='j_idt102:distributionId_label']")
organisation.click()
dropdown1 = driver.find_element_by_xpath(".//*[@id='j_idt102:distributionId_filter']")
dropdown1.send_keys('1461')
dropdown1.send_keys(u'\ue007')

# Same pattern for the unit (UEVCB) drop-down.
unit = driver.find_element_by_id('j_idt102:uevcb_label')
dropdown2 = driver.find_element_by_xpath(".//*[@id='j_idt102:uevcb_filter']")
dropdown2.send_keys('SAMA')
dropdown2.send_keys(u'\ue007')

apply = driver.find_element_by_xpath("//*[@id='j_idt102:goster']")
apply.click()
time.sleep(5)  # wait for the result table to render

# Name the parser explicitly so parsing does not depend on what happens
# to be installed.
soup = BeautifulSoup(driver.page_source, 'html.parser')
table = soup.find_all('table')[0]
rows = table.find_all('tr')[1:]  # skip the header row

# One list per output column; the numeric prefixes keep column order.
data = {
    '01.Date': [],
    '02.Hour': [],
    '03.NaturalGas': [],
    '04.Wind': [],
    '05.Lignite': [],
    '06.Hard_Coal': [],
    '07.ImportedCoal': [],
    '08.Geothermal': [],
    '09.Hydro_Dam': [],
    '10.Naphta': [],
    '11.Biomass': [],
    '12.River': [],
    '13.Other': []
}
for row in rows:
    cols = row.find_all('td')
    data['01.Date'].append(cols[0].get_text())
    data['02.Hour'].append(cols[1].get_text())
    # NOTE(review): cols[2] is intentionally skipped here — presumably a
    # "Total" column; confirm against the live table layout.
    data['03.NaturalGas'].append(cols[3].get_text())
    data['04.Wind'].append(cols[4].get_text())
    data['05.Lignite'].append(cols[5].get_text())
    data['06.Hard_Coal'].append(cols[6].get_text())
    data['07.ImportedCoal'].append(cols[7].get_text())
    data['08.Geothermal'].append(cols[8].get_text())
    data['09.Hydro_Dam'].append(cols[9].get_text())
    data['10.Naphta'].append(cols[10].get_text())
    data['11.Biomass'].append(cols[11].get_text())
    data['12.River'].append(cols[12].get_text())
    data['13.Other'].append(cols[13].get_text())

df = pd.DataFrame(data)
# ExcelWriter.save() was removed in pandas 2.0; the context manager saves
# and closes the workbook automatically.
with pd.ExcelWriter('//192.168.0.102/Data/kgup.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1')
time.sleep(5)
# quit() (rather than close()) also ends the driver session and frees the
# browser process.
driver.quit()
By this code we can select from first dropdown using search function and Enter key.
When it comes to second, it generates ImportError: sys.meta_path is None, Python is likely shutting down
How should I handle this?
Thanks.
Your code seems to be sensitive to StaleElementException as well as to the exception Element is not clickable at point.... Try the code below for the web-scraping part and let me know the result:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import pandas as pd
# Robust version of the KGÜP scrape: every interaction waits for the PrimeFaces
# loading modal to disappear first, avoiding StaleElementException and
# "element not clickable" errors. '\ue007' is Selenium's ENTER key code.
url = 'https://seffaflik.epias.com.tr/transparency/uretim/planlama/kgup.xhtml' #
driver = webdriver.Chrome()
driver.get(url)
wait = WebDriverWait(driver, 20)
driver.maximize_window()
wait.until_not(EC.visibility_of_element_located((By.ID,'j_idt15'))) # wait until loading modal disappeared
wait.until(EC.element_to_be_clickable((By.ID,'j_idt102:distributionId_label'))).click() # open organization drop-down
wait.until(EC.element_to_be_clickable((By.ID, 'j_idt102:distributionId_filter'))).send_keys('1461' + u'\ue007') # filter and select with ENTER
wait.until_not(EC.visibility_of_element_located((By.ID,'j_idt179_modal'))) # wait until modal disappeared
wait.until(EC.element_to_be_clickable((By.ID,'j_idt102:uevcb_label'))).click() # open unit drop-down
wait.until(EC.element_to_be_clickable((By.ID, 'j_idt102:uevcb_filter'))).send_keys('SAMA' + u'\ue007') # filter and select unit with ENTER
wait.until(EC.element_to_be_clickable((By.ID,'j_idt102:goster'))).click() # click Apply
wait.until_not(EC.visibility_of_element_located((By.ID,'j_idt15'))) # wait until modal disappeared
# NOTE(review): no parser is named here; BeautifulSoup will pick whichever
# HTML parser is installed — consider 'html.parser' for determinism.
soup = BeautifulSoup(driver.page_source)
....