Crawl table data from 2 drop-down menus - Python

I have this website: https://www.adbc.gov.ae/BusinessActivityInfo/BusinessActivity.aspx?culture=en-US
This website has 2 dropdown menus: Category and SubCategory. After choosing a Category and a SubCategory, it displays a table; different Category/SubCategory combinations display different tables. How can I crawl this table for each Category and SubCategory?
This is what I have tried so far:
import requests
from bs4 import BeautifulSoup

url = 'https://www.adbc.gov.ae/BusinessActivityInfo/BusinessActivity.aspx?culture=en-US'
req = requests.get(url)
soup = BeautifulSoup(req.text, "lxml")
content = soup.find("select", {"name": "ddlNatureId"})
options = content.find_all("option")
options1 = [y.text for y in options]
options1
The output:
['',
'ADVOCATE OFFICES',
'AGENCIES',
'AGRICULTURE',
'AGRICULTURE, LIVESTOCK AND FISHERIES ACTIVITIES',
'ANIMAL HUSBANDRY',
'ANIMAL SHELTERING SERVICES',
'ART GALLERY',
'AUDITING OFFICES',
'BAKERIES AND SWEETS',
...
]
Update:
This is what I have got so far: I found that Selenium can be used to select a value from the dropdown list. This is my code:
Some libraries:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from selenium.webdriver.support.ui import Select
import time
import sys
from bs4 import BeautifulSoup
import requests
Set up webdriver:
url = 'https://www.adbc.gov.ae/BusinessActivityInfo/BusinessActivity.aspx?culture=en-US'
chrome_driver_path = 'D:\\work\\crawl data\\selenium_project\\chromedriver.exe'
chrome_options = Options()
chrome_options.add_argument('--headless')
webdriver = webdriver.Chrome(
    executable_path=chrome_driver_path, options=chrome_options
)
Load the website and scrape the data:
with webdriver as driver:
    # Set timeout time
    wait = WebDriverWait(driver, 10)
    # Retrieve url in headless browser
    driver.get(url)
    # Find select box
    search = Select(driver.find_element_by_id("ddlNatureId"))
    search.select_by_value('ADVOCATE OFFICES')
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "lxml")
    price = soup.find("select", {"name": "ddlSubCategId"})
    options = price.find_all("option")
    options1 = [y.text for y in options]
    driver.close()
print(options1)
Output:
[]
Expected output (it should be the list of SubCategories for the Category 'ADVOCATE OFFICES'):
['',
'Advertising Agent',
'Advocate Offices',
'Agricultural Equipment And Tools Rental',
'Air Transport',
'Agents',
...
]
My problem right now is that I cannot get the SubCategory data after I select the Category. How can I solve this problem?
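A minimal sketch of one possible direction (untested against the live site, and assuming the page repopulates ddlSubCategId via a postback once a Category is chosen): after selecting with Selenium, parse driver.page_source instead of issuing a fresh requests.get(url), because the new HTTP request knows nothing about the selection made in the headless browser.
with webdriver as driver:
    # Hedged sketch, not a verified solution
    wait = WebDriverWait(driver, 10)
    driver.get(url)
    category = Select(driver.find_element_by_id("ddlNatureId"))
    category.select_by_visible_text('ADVOCATE OFFICES')  # the option values may be numeric IDs, so select by visible text
    # Assumption: choosing a Category triggers a postback that fills ddlSubCategId
    wait.until(lambda d: len(Select(d.find_element_by_id("ddlSubCategId")).options) > 1)
    soup = BeautifulSoup(driver.page_source, "lxml")
    sub_options = [o.text for o in soup.find("select", {"name": "ddlSubCategId"}).find_all("option")]
print(sub_options)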

Related

How to scrape a site by loading content via triggering load more button?

I need to scrape the titles of all blog post articles loaded via a Load More button, over the range I set with for i in range(1,3):. At present I'm only able to capture the titles from the first page, even though I'm able to navigate to the next page using Selenium.
Update:
In a previous question of mine (How To Scrape Content With Load More Pages Using Selenium Python), the pagination URL was captured via:
Network tab > Reload page > Click the Show more button > Select wp-admin/admin-ajax.php?...... > Right click > Copy > Copy Link Address.
However, I do not know how to capture a similar URL for the site learnwoo.com/blog. I'm not sure whether it uses a different technique.
Any help would be much appreciated.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time

# Selenium Routine
from requests_html import HTMLSession
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

# Removes SSL Issues With Chrome
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_argument('--ignore-certificate-errors-spki-list')
options.add_argument('log-level=3')
options.add_argument('--disable-notifications')
#options.add_argument('--headless') # Comment to view browser actions

# Get website url
urls = "https://learnwoo.com/blog/"
r = requests.get(urls)

driver = webdriver.Chrome(executable_path=r"C:\webdrivers\chromedriver.exe", options=options)
driver.get(urls)

productlist = []

for i in range(1, 3):
    # Get Page Information
    soup = BeautifulSoup(r.content, features='lxml')
    items = soup.find_all('div', class_='td_module_1')
    print(f'LOOP: start [{len(items)}]')
    for single_item in items:
        title = single_item.find('h3').text.strip()
        print('Title:', title)
        product = {
            'Title': title,
        }
        productlist.append(product)
    print()
    time.sleep(5)
    WebDriverWait(driver, 40).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='next-page-tdi_5']"))).send_keys(Keys.ENTER)

driver.close()

# Save Results
df = pd.DataFrame(productlist)
df.to_csv('Results.csv', index=False)
Alternative solution: You can use the API response to extract the desired data. From the API response, I'm getting a total of 74 items, where each page contains 6 items.
import pandas as pd
import requests
from bs4 import BeautifulSoup

params = {
    'id': '',
    'post_id': '0',
    'slug': 'home',
    'canonical_url': 'https://jooble.org/blog/',
    'posts_per_page': '6',
    'page': '0',
    'offset': '20',
    'post_type': 'post',
    'repeater': 'default',
    'seo_start_page': '1',
    'preloaded': 'false',
    'preloaded_amount': '0',
    'lang': 'en',
    'order': 'DESC',
    'orderby': 'date',
    'action': 'alm_get_posts',
    'query_type': 'standard',
}

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
}

api_url = 'https://jooble.org/blog/wp-admin/admin-ajax.php'
productlist = []

for params['page'] in range(0, 13):
    req = requests.get(api_url, params=params, headers=headers)
    e = req.json()['html']
    soup = BeautifulSoup(e, 'lxml')
    items = soup.find_all('div', class_='front__news-content-wrapper')
    for single_item in items:
        title = single_item.find('div', class_='front__news-title')
        title = title.text.strip() if title else None
        product = {
            'Title': title,
        }
        productlist.append(product)

df = pd.DataFrame(productlist)
print(df)
Output:
Title
0 How to become an anesthesiologist
1 How to Become a Flight Attendant
2 How To Become An Influencer
3 How to Become an Electrician
4 3 Common Job Scams You Should Stay Away From
.. ...
69 Exploring Main Types of Remote Work
70 14 books HR specialist should read. Part 2
71 14 books HR specialist should read. Part 1
72 Don’t do that: 7 mistakes ruining your job int...
73 Virtual job interview. Jooble tips how to nail it
[74 rows x 1 columns]
To answer your question in a Selenium context, you could call .click():
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='next-page-tdi_5']"))).click()
Concerning your XHR request comment - note that here it is not a GET but a POST request (https://learnwoo.com/wp-admin/admin-ajax.php?td_theme_name=Newspaper&v=11), and you have to send some additional payload with requests.
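A rough requests-only sketch of that POST (the form-data keys below are placeholders, not the confirmed payload for the Newspaper theme - copy the real key/value pairs from the admin-ajax.php entry in the Network tab):
import requests

api_url = 'https://learnwoo.com/wp-admin/admin-ajax.php?td_theme_name=Newspaper&v=11'
payload = {
    # placeholders only - take the actual form data from DevTools > Network > Payload
    # 'action': '...',
}
headers = {'x-requested-with': 'XMLHttpRequest'}
resp = requests.post(api_url, data=payload, headers=headers)
print(resp.status_code)
print(resp.text[:200])  # inspect what the endpoint returns before parsing it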
Example
This example is based on Selenium 4 and uses its imports; see https://www.selenium.dev/documentation/webdriver/getting_started/upgrade_to_selenium_4/#python-1
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

urls = "https://learnwoo.com/blog/"
driver.get(urls)
productlist = []

for i in range(1, 3):
    soup = BeautifulSoup(driver.page_source)
    items = soup.find_all('div', class_='td_module_1')
    print(f'LOOP: start [{len(items)}]')
    for single_item in items:
        product = {
            'Title': single_item.find('h3').text.strip(),
        }
        productlist.append(product)
    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='next-page-tdi_5']"))).click()

pd.DataFrame(productlist)
Output
Title
0 5 Futuristic eCommerce Trends
1 9 Best Shopify Apps to Print Shipping Labels
2 Cloud Hosting VS VPS Hosting: A Comparison
3 10 Best WooCommerce Facebook Integration Plugins
4 Complete Guide to BigCommerce Security
... ...
91 How To Calculate ROI of Your Moodle LMS?
92 How and Where to Find Help for WordPress Begin...
93 Expert Speaks: In Conversation with Amir Helze...
94 A Complete Analysis: NetSuite WooCommerce Inte...
95 Review of Currency Switcher & Converter for Sh...
96 rows × 1 columns
Here is an alternative to HedgHog's response: maybe the better way here is to use a while loop, as we don't know how many entries there are. I used a counter to break out of the loop after the fifth load - if you want to get all the entries, just remove the counter.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import pandas as pd

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")

webdriver_service = Service("chromedriver/chromedriver")  # path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(browser, 20)

big_list = []
counter = 1
url = 'https://learnwoo.com/blog/'
browser.get(url)

while True:
    try:
        load_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[aria-label="load_more"]')))
        load_button.click()
        counter = counter + 1
        print('clicked to load more')
        t.sleep(3)
        entries = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'h3[class="entry-title td-module-title"]')))
        print('we have', len(entries), 'articles')
        big_list = [(x.text, x.find_element(By.TAG_NAME, 'a').get_attribute('href')) for x in entries]
        if counter > 5:
            break
    except Exception as e:
        print('all done')
        break

df = pd.DataFrame(big_list, columns=['Article', 'Url'])
print(df)
Result:
Article Url
0 5 Futuristic eCommerce Trends https://learnwoo.com/future-ecommerce-trends/
1 9 Best Shopify Apps to Print Shipping Labels https://learnwoo.com/best-shopify-apps-print-s...
2 Cloud Hosting VS VPS Hosting: A Comparison https://learnwoo.com/cloud-hosting-vps-hosting/
3 10 Best WooCommerce Facebook Integration Plugins https://learnwoo.com/best-woocommerce-facebook...
4 Complete Guide to BigCommerce Security https://learnwoo.com/bigcommerce-security-guide/
... ... ...
286 A Comparison Between Omnichannel and Multichan... https://learnwoo.com/omnichannel-multichannel-...
287 8 Winning Techniques for Off-page SEO https://learnwoo.com/winning-techniques-for-of...
288 WooCommerce – How to Understand User Roles and... https://learnwoo.com/woocommerce-understand-us...
289 7 Best Free WooCommerce Catalog Mode Plugins (... https://learnwoo.com/free-woocommerce-catalog-...
290 Different WooCommerce Product Types Explained ... https://learnwoo.com/woocommerce-different-pro...
291 rows × 2 columns

Showing null or nothing by fetching data from website with Selenium Python

I want to fetch the active status and the uptime data from the website, but all I can get is the active status; there is no uptime data. I am targeting the exact class on the page, but it shows null. Can you please modify my code and point out where I am doing something wrong? Please help!
Python code:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import soupsieve
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
time.sleep(5)
cosmos = "https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw"
driver.get(cosmos)
time.sleep(8)
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.close()
uptime = soup.find('li', {'class': "InfoRow_container__2xTzg"})
uptime_dep = uptime.find('div', {'class': "InfoRow_value__1CHna"}).string
print(uptime_dep)
acitve = soup.find('div', {'class': "ValidatorInfo_statusBadge__PBIGr"})
para = acitve.find('p').string
print(para)
Personally, I'd skip Selenium and get the data from the API.
import requests
status_dict = {
    2: 'Inactive',
    3: 'Active'
}
url = 'https://api.cosmostation.io/v1/staking/validator/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw'
jsonData = requests.get(url).json()
uptime = 1 - (jsonData['uptime']['missed_blocks'] / jsonData['uptime']['over_blocks'])
status = jsonData['status']
print(uptime)
print(status_dict[status])
Output:
0.91
Active
Here's the JSON response:
print(jsonData)
{'rank': 38, 'account_address': 'cosmos1we6knm8qartmmh2r0qfpsz6pq0s7emv3um0vsa', 'operator_address': 'cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw', 'consensus_pubkey': 'cosmosvalconspub16adydsk7nw3d63qtn30t5rexhfg56pq44sw4l9ld0tcj6jvnx30sm7h6lm', 'bonded_height': 0, 'bonded_time': '0001-01-01T00:00:00Z', 'jailed': False, 'status': 3, 'tokens': '1034297369481', 'delegator_shares': '1034297369481.000000000000000000', 'moniker': 'Staked', 'identity': 'E7BFA6515FB02B3B', 'website': 'https://staked.us/', 'details': 'Staked operates highly available and highly secure, institutional grade staking infrastructure for leading proof-of-stake (PoS) protocols.', 'unbonding_height': '0', 'unbonding_time': '1970-01-01T00:00:00Z', 'rate': '0.100000000000000000', 'max_rate': '0.200000000000000000', 'max_change_rate': '0.020000000000000000', 'update_time': '2019-03-13T23:00:00Z', 'uptime': {'address': '7B3A2EFE5B3FCDF819FCF52607314CEFE4754BB6', 'missed_blocks': 9, 'over_blocks': 100}, 'min_self_delegation': '1', 'keybase_url': ''}
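As a quick sanity check of the uptime formula above, plugging in the values from the 'uptime' object in this response:
# missed_blocks = 9, over_blocks = 100 (from the JSON above)
print(1 - 9 / 100)  # 0.91, matching the script's output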
Selenium-based solution:
You can use the below XPath:
//div[text()='Uptime']//following::div
to retrieve the Uptime value from the HTML DOM.
Code:
wait = WebDriverWait(driver, 30)
cosmos = "https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw"
driver.get(cosmos)
time.sleep(8)
soup = BeautifulSoup(driver.page_source, 'lxml')
#driver.close()
# uptime = soup.find('li', {'class': "InfoRow_container__2xTzg"})
# uptime_dep = uptime.find('div', {'class': "InfoRow_value__1CHna"}).string
# print(uptime_dep)
ele = wait.until(EC.visibility_of_element_located((By.XPATH, "//div[text()='Uptime']//following::div")))
print(ele.text)
acitve = soup.find('div', {'class': "ValidatorInfo_statusBadge__PBIGr"})
para = acitve.find('p').string
print(para)
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Output:
97%
Active

Beautifulsoup/Selenium how to scrape website until next page is disabled?

So I have a list of urls (called "data") that contains urls like
https://www.amazon.com/Airpods-Fashion-Protective-Accessories-Silicone/product-reviews/B08YD8JLNQ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
and
https://www.amazon.com/Keychain-R-fun-Protective-Accessories-Visible-Sky/product-reviews/B082W7DL1R/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
Some URLs do not have the "Next Page" icon and some do. So far my code is something like this:
from bs4 import BeautifulSoup
import requests
import csv
import os
import pandas as pd
from selenium import webdriver
from selenium.common import exceptions
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException

df = pd.read_csv(r'path to csv file', sep=',', usecols=['Url'], squeeze=True)
data = pd.read_csv(r'path to csv file', sep=',', usecols=['Url'], squeeze=True)
rows = []

for url in data:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    soup.prettify()
    # CSS selectors like these need soup.select(), not find_all()
    # names = soup.find_all('span', class_="a-profile-name")
    # div.celwidget div.aok-relative span.a-profile-name
    names = soup.select('div.celwidget div.aok-relative span.a-profile-name')
    rating = soup.select('div.celwidget div.aok-relative span.a-icon-alt')
    title = soup.select('div.celwidget div.aok-relative a.a-text-bold span')
    content = soup.select('div.celwidget div.aok-relative span.review-text-content span')
I want to scrape the names, ratings, etc. from the reviews up to the last page, where the Next Page button is disabled.
I'm not quite sure what to do from here. I looked around, and many related questions use .click() on the Next Page button, which I don't think is the answer I need/want.
The next page url is stored in a list item with class name a-last. So you could create a while loop that breaks if soup.find('li', class_='a-last') returns nothing anymore (i.e. if the last page has been reached):
from selenium import webdriver
from bs4 import BeautifulSoup
import time

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

url = 'https://www.amazon.com/Keychain-R-fun-Protective-Accessories-Visible-Sky/product-reviews/B082W7DL1R/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews'  # or https://www.amazon.com/s?k=maison+kitsune+airpod+pro+case
wd = webdriver.Chrome('chromedriver', options=options)

while True:
    wd.get(url)
    soup = BeautifulSoup(wd.page_source, "html.parser")
    # store data here
    try:
        url = 'https://www.amazon.com/' + soup.find('li', class_='a-last').find('a', href=True)['href']
        time.sleep(2)  # prevent ban
    except:
        break
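For the '# store data here' step, a possible sketch that reuses the CSS selectors from the question and appends to a rows list (initialised as rows = [] before the loop, as in the question); Amazon's class names change frequently, so treat the selectors as assumptions that may need updating:
    # inside the while loop, in place of '# store data here'
    for review in soup.select('div.celwidget div.aok-relative'):
        name = review.select_one('span.a-profile-name')
        rating = review.select_one('span.a-icon-alt')
        title = review.select_one('a.a-text-bold span')
        body = review.select_one('span.review-text-content span')
        rows.append({
            'Name': name.get_text(strip=True) if name else None,
            'Rating': rating.get_text(strip=True) if rating else None,
            'Title': title.get_text(strip=True) if title else None,
            'Content': body.get_text(strip=True) if body else None,
        })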

How to find the nth child heading and print the text using beautifulsoup in python

According to my code, I am able to get the first heading of the Project, and I want the subheading (FSI Details) to be printed. I am not able to get the second heading using BeautifulSoup. I tried the reference for the nth-child.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
import urllib.request
from bs4 import BeautifulSoup
import time
import pandas as pd
import os

url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'

driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)

WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()
Registered_Project_radio = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)

Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();", Search)

View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a") if item.get_attribute('href') is not None]
View = View[0]
driver.get(View)

request = urllib.request.Request(View)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')

divPInfo2 = soup.find("div", {"id": "DivProject"})

Project_title = divPInfo2.find("div", {'class': 'x_panel'}, recursive=False).find("div", {'class': 'x_title'}).find("h2").text.strip()
print(Project_title)

Project_title1 = divPInfo2.find("div", {'class': 'x_panel'}, recursive=False).find("div", {'class': 'x_title'}).find_all("h2")[1].text.strip()
print(Project_title1)  # the (FSI Details) heading should be printed here
You can try the CSS selector :contains("FSI Details"), which selects elements containing the string "FSI Details". This code prints the labels and values of the "FSI Details" section:
import requests
from bs4 import BeautifulSoup

url = 'https://maharerait.mahaonline.gov.in/PrintPreview/PrintPreview?q=BPUvrrjIzYs%2f2hwYj1YIOfflh9NisZW6zTns2KLjHBZn6cbQ008s91nzlFrDxVvLwR1vAeLID0%2bo%2bD0H0Z6o2t%2b5P%2b%2fbBOcHCbMQHU8gkwdNZJnbbfu6N7mWSpgKXt4AiQyzuEpoDE7FX6HZypqsGXz4ObYD4KpyRzCsFJaWTgA%3d'
soup = BeautifulSoup(requests.get(url).text, 'lxml')

fsi_content = soup.select_one('.x_title:contains("FSI Details") + .x_content')

print('{: <160}{: <8}'.format('Label', 'Value'))
print('-' * 168)
for label, text in zip(fsi_content.select('label'), fsi_content.select('div:has(> label) + div')):
    print('{: <160}{: <8}'.format(label.get_text(strip=True), text.get_text(strip=True)))
Prints:
Label Value
------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Built-up-Area as per Proposed FSI (In sqmts) ( Proposed but not sanctioned) ( As soon as approved, should be immediately updated in Approved FSI) 0
Built-up-Area as per Approved FSI (In sqmts) 11566.50
TotalFSI 11566.50
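For completeness, a hedged sketch of the nth-heading lookup the asker attempted, expressed with CSS selectors against the soup built in the question's code (it assumes the h2 headings sit inside .x_title blocks under direct .x_panel children of #DivProject, as in the question's find() chain):
headings = soup.select('#DivProject > .x_panel .x_title h2')
if len(headings) > 1:
    print(headings[1].get_text(strip=True))  # should be the FSI Details heading, if the structure matches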
Further reading:
CSS Selectors Reference

Scraping from dropdown menus with Python

I am a newbie with Python, trying to retrieve data from this site using Python 3.6.0.
There are 2 dropdowns, and the second's data depends on the first's selection.
First: 'Organizasyon Adi'
Second: 'UEVCB Adi'
The options in the page source look like this:
<option value="0" selected="selected">TÜMÜ</option> <!-- the default value when the page opens -->
<option value="10374">1461 TRABZON ELEKTRİK ÜRETİM A.Ş</option>
<option value="9426">2M ELEKTRİK ÜRETİM SANAYİ VE TİCARET ANONİM ŞİRKETİ</option>
These are the options for the first dropdown; there are almost 800 of them.
We can't see the second dropdown's options without inspecting the page unless the second dropdown box is clicked. (Both dropdowns open a search box when clicked.)
The second dropdown lists the units for the selected organisation.
When options are selected in both dropdowns, the page generates a table, and we're trying to get the data for all units.
I couldn't manage to scrape the data for all units with one program, so I decided to scrape them individually.
With this code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains
import time
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://seffaflik.epias.com.tr/transparency/uretim/planlama/kgup.xhtml'
driver = webdriver.Chrome()
driver.get(url)
time.sleep(3)

organisation = driver.find_element_by_xpath(".//*[@id='j_idt102:distributionId_label']")
organisation.click()
dropdown1 = driver.find_element_by_xpath(".//*[@id='j_idt102:distributionId_filter']")
dropdown1.send_keys('1461')
dropdown1.send_keys(u'\ue007')

unit = driver.find_element_by_id('j_idt102:uevcb_label')
dropdown2 = driver.find_element_by_xpath(".//*[@id='j_idt102:uevcb_filter']")
dropdown2.send_keys('SAMA')
dropdown2.send_keys(u'\ue007')

apply = driver.find_element_by_xpath("//*[@id='j_idt102:goster']")
apply.click()
time.sleep(5)

soup = BeautifulSoup(driver.page_source)
table = soup.find_all('table')[0]
rows = table.find_all('tr')[1:]

data = {
    '01.Date': [],
    '02.Hour': [],
    '03.NaturalGas': [],
    '04.Wind': [],
    '05.Lignite': [],
    '06.Hard_Coal': [],
    '07.ImportedCoal': [],
    '08.Geothermal': [],
    '09.Hydro_Dam': [],
    '10.Naphta': [],
    '11.Biomass': [],
    '12.River': [],
    '13.Other': []
}

for row in rows:
    cols = row.find_all('td')
    data['01.Date'].append(cols[0].get_text())
    data['02.Hour'].append(cols[1].get_text())
    data['03.NaturalGas'].append(cols[3].get_text())
    data['04.Wind'].append(cols[4].get_text())
    data['05.Lignite'].append(cols[5].get_text())
    data['06.Hard_Coal'].append(cols[6].get_text())
    data['07.ImportedCoal'].append(cols[7].get_text())
    data['08.Geothermal'].append(cols[8].get_text())
    data['09.Hydro_Dam'].append(cols[9].get_text())
    data['10.Naphta'].append(cols[10].get_text())
    data['11.Biomass'].append(cols[11].get_text())
    data['12.River'].append(cols[12].get_text())
    data['13.Other'].append(cols[13].get_text())

df = pd.DataFrame(data)

writer = pd.ExcelWriter('//192.168.0.102/Data/kgup.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1')
writer.save()
time.sleep(5)
driver.close()
With this code we can select from the first dropdown using the search function and the Enter key.
When it comes to the second one, it raises ImportError: sys.meta_path is None, Python is likely shutting down.
How should I handle this?
Thanks.
Your code seems to be sensitive to StaleElementReferenceException as well as to the Element is not clickable at point... exception. Try the code below for the web-scraping part and let me know the result:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://seffaflik.epias.com.tr/transparency/uretim/planlama/kgup.xhtml' #
driver = webdriver.Chrome()
driver.get(url)
wait = WebDriverWait(driver, 20)
driver.maximize_window()
wait.until_not(EC.visibility_of_element_located((By.ID,'j_idt15'))) # wait until modal disappeared
wait.until(EC.element_to_be_clickable((By.ID,'j_idt102:distributionId_label'))).click() # organization drop-down
wait.until(EC.element_to_be_clickable((By.ID, 'j_idt102:distributionId_filter'))).send_keys('1461' + u'\ue007') # select required
wait.until_not(EC.visibility_of_element_located((By.ID,'j_idt179_modal'))) # wait until modal disappeared
wait.until(EC.element_to_be_clickable((By.ID,'j_idt102:uevcb_label'))).click() # unit drop-down
wait.until(EC.element_to_be_clickable((By.ID, 'j_idt102:uevcb_filter'))).send_keys('SAMA' + u'\ue007') # select unit
wait.until(EC.element_to_be_clickable((By.ID,'j_idt102:goster'))).click() # click Apply
wait.until_not(EC.visibility_of_element_located((By.ID,'j_idt15'))) # wait until modal disappeared
soup = BeautifulSoup(driver.page_source)
....
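A hedged follow-up sketch for the original goal of covering every organisation. Assumptions: the hidden native select element whose option markup is shown in the question stays present in the page source, and sending the full organisation name into the filter box followed by Enter selects it, just as the partial text '1461' does above:
org_names = [o.get_text(strip=True)
             for o in soup.select("select[id*='distributionId'] option")
             if o.get('value') != '0']  # skip the default "TÜMÜ" entry
for org in org_names:
    wait.until(EC.element_to_be_clickable((By.ID, 'j_idt102:distributionId_label'))).click()
    wait.until(EC.element_to_be_clickable((By.ID, 'j_idt102:distributionId_filter'))).send_keys(org + u'\ue007')
    wait.until_not(EC.visibility_of_element_located((By.ID, 'j_idt179_modal')))  # wait for the loading modal to disappear
    # ... then pick the unit(s) from the second dropdown, click Apply and parse the table as above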
