Python: appending a DataFrame in a loop

I'm trying to retrieve stock information from 2 different URLs and write the information to a pandas DataFrame. However, I keep getting errors. Could anyone please help me out here?
I'm pretty new to Python, so my code will probably look very ugly :D
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'
}

PATH = 'C:\Program Files (x86)\chromedriver.exe'
options = Options()
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument("--window-size=2550,1440")
s = Service('C:\Program Files (x86)\chromedriver.exe')
driver = webdriver.Chrome(PATH, options=options)
driver.implicitly_wait(10)

# create a dataframe
dn = []

def accept_cookies():
    try:
        driver.find_element(By.ID, 'accept-choices').click()
    except:
        print('fu')

stocklist = ["FB", "KLIC"]
for x in stocklist:
    url = f"https://stockanalysis.com/stocks/{x}/financials/"
    driver.get(url)
    driver.implicitly_wait(10)
    accept_cookies()
    driver.implicitly_wait(10)
    driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
    xlwriter = pd.ExcelWriter('financial statements1.xlsx', engine='xlsxwriter')
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    df = pd.read_html(str(soup), attrs={'id': 'financial-table'})[0]
    new_df = pd.concat(df)
    dn.to_excel(xlwriter, sheet_name='key', index=False)
    xlwriter.save()

pd.concat needs a list of objects to concatenate, whereas you have only given it df.
So I think you should replace pd.concat(df) with pd.concat([df, new_df]) and create new_df = pd.DataFrame() before the for loop.
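A minimal sketch of that pattern, with toy frames standing in for the tables scraped in the loop (creating the ExcelWriter once, after the loop, as a context manager is my own assumption, not part of the answer above):
import pandas as pd

# Toy stand-ins for the per-ticker tables that read_html would return (illustration only).
tables = [
    pd.DataFrame({'metric': ['revenue'], 'value': [100]}),
    pd.DataFrame({'metric': ['revenue'], 'value': [200]}),
]

new_df = pd.DataFrame()                 # accumulator created before the loop
for df in tables:
    new_df = pd.concat([df, new_df])    # pd.concat expects a list of frames

# Write once, after the loop (requires the xlsxwriter package).
with pd.ExcelWriter('financial statements1.xlsx', engine='xlsxwriter') as xlwriter:
    new_df.to_excel(xlwriter, sheet_name='key', index=False)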

Assuming there is no issue with the .read_html() part, you should push each df into a list of DataFrames:
dflist = []
for x in stocklist:
    url = f"https://stockanalysis.com/stocks/{x}/financials/"
    driver.get(url)
    driver.implicitly_wait(10)
    accept_cookies()
    driver.implicitly_wait(10)
    driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    dflist.append(pd.read_html(str(soup), attrs={'id': 'financial-table'})[0])
After finishing the iteration, you can simply concat the list of DataFrames into a single one:
xlwriter = pd.ExcelWriter('financial statements1.xlsx', engine='xlsxwriter')
pd.concat(dflist).to_excel(xlwriter, sheet_name='key', index=False)
xlwriter.save()
Complete example:
dflist = []
for x in stocklist:
    url = f"https://stockanalysis.com/stocks/{x}/financials/"
    driver.get(url)
    driver.implicitly_wait(10)
    accept_cookies()
    driver.implicitly_wait(10)
    driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    dflist.append(pd.read_html(str(soup), attrs={'id': 'financial-table'})[0])

xlwriter = pd.ExcelWriter('financial statements1.xlsx', engine='xlsxwriter')
pd.concat(dflist).to_excel(xlwriter, sheet_name='key', index=False)
xlwriter.save()


How can I web scrape a government website with Python? I cannot get the table to show

I am trying to web scrape the data from this government website: https://www.itf.gov.hk/en/project-search/search-result/index.html?isAdvSearch=1&Programmes=TVP
However, after reading a lot about web scraping and following YouTube videos, I still can't do it. Can someone please help?
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'https://www.itf.gov.hk/en/project-search/project-profile/index.html?ReferenceNo=TVP/2122/22'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
soup

table = soup.find('table', {'class': 'colorTbl projDetailTbl'})

headers = []
for i in table.find_all('th'):
    title = i.text.strip()
    headers.append(title)

df = pd.DataFrame(columns=headers)
for row in table.find_all('tr')[1:]:
    data = row.find_all('td')
    row_data = [td.text.strip() for td in data]
    length = len(df)
    df.loc[length] = row_data
The table doesn't show up at all; the result is None. Please help me.
The table is rendered through JavaScript, and the data is returned through an API, so you need to get the data from that source.
Once you have the "Reference" values, you can feed them into the API again to get the "linked" data, and finally merge them together.
Code:
import pandas as pd
import requests

tokenUrl = 'https://www.itf.gov.hk/API/Token/Get'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}
token = requests.get(tokenUrl, headers=headers).text

url = 'https://www.itf.gov.hk/API/Project/Search'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
    'verificationtoken': token}

# Get Full Tables
page = 1
rows = []
while True:
    payload = {
        'Page': '%s' % page,
        'Programmes[]': 'TVP'}

    jsonData = requests.post(url, headers=headers, data=payload).json()
    rows += jsonData['Records']
    page += 1
    print(f"{len(rows)} of {jsonData['Total']}")
    if len(rows) == jsonData['Total']:
        print('Complete')
        break

df = pd.DataFrame(rows)
references = list(df['Reference'])

# Use References to get "linked" data
refRows = []
for ref in references:
    print(ref)
    url = 'https://www.itf.gov.hk/API/Project/Get'
    payload = {
        'Reference': ref}

    jsonData = requests.post(url, headers=headers, data=payload).json()
    TechnologicalSolutions = jsonData['TechnologicalSolutions'][0]
    row = jsonData
    row.update(TechnologicalSolutions)
    refRows.append(row)

refDf = pd.DataFrame(refRows)

# Merge together
df = df.merge(refDf, how='left', on=['Reference'])
Output:
print(df)
Reference ... SCName
0 TVP/2122/22 ... 销售点管理系统
1 TVP/2120/22 ... 销售点管理系统
2 TVP/2107/22 ... 企业资源规划方案
3 TVP/2105/22 ... 企业资源规划方案
4 TVP/2103/22 ... 电子库存管理系统
5 TVP/2101/22 ... 文件管理及流动存取系统
6 TVP/2097/22 ... 销售点管理系统
7 TVP/2092/22 ... 企业资源规划方案
8 TVP/2086/22 ... 销售点管理系统
9 TVP/2085/22 ... 企业资源规划方案
[10 rows x 66 columns]
After inspecting the website, I see no such class projDetailTbl. However, you can use the following code to process the data from the website.
table = soup.find('table', class_='colorTbl')
or you can search on id if there happen to be more classes of the same name.
table = soup.find('table', id='searchResultOutput')
I'm afraid it cannot be done this way, as the site generates the table with JavaScript. The body of the HTML that you get in your soup contains no data:
<body>
<div id="content"><script src="/filemanager/template/common/js/search.js" type="text/javascript"></script>
<h1 class="a_center">Project Profile</h1>
<div class="tableResponsive" id="projectProfile"> 
<table class="colorTbl projDetailTbl">
<tbody></tbody>
</table>
<div class="remarks" id="techAreaRemark" style="display: none;">* The primary technology area as indicated by the project coordinator is placed first.</div>
</div></div>
</body>
That is (part of) the soup you are getting with:
url = 'https://www.itf.gov.hk/en/project-search/project-profile/index.html?ReferenceNo=TVP/2122/22'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
The content of the page is filled in afterwards by:
<div id="content"><script src="/filemanager/template/common/js/search.js" type="text/javascript"></script> ...
...
</div>
Your code is probably fine, but the page you are scraping is not yet filled with data at the moment you make the request.
Maybe you could try to get it using Selenium.
Regards...
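Following up on that Selenium suggestion, a minimal sketch (my own assumption, not tested against the live site: it presumes chromedriver is available to Selenium and that the table eventually renders inside the #projectProfile div shown above) could look like this:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = 'https://www.itf.gov.hk/en/project-search/project-profile/index.html?ReferenceNo=TVP/2122/22'

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)  # assumes chromedriver is on PATH or managed by Selenium 4

driver.get(url)
# Wait until the JavaScript has inserted at least one row into the table body.
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '#projectProfile table tbody tr')))

# Parse the now-populated page; read_html picks up the rendered table.
df = pd.read_html(driver.page_source)[0]
driver.quit()
print(df)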
I used Selenium and webdriver_manager to handle the JavaScript execution.
To install Selenium, run pip install selenium; to load the driver automatically, install webdriver_manager with pip install webdriver-manager.
Here is my code (it worked for me):
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager  # automatic webdriver for Chrome browser (can change to your browser)

URL = 'https://www.itf.gov.hk/en/project-search/project-profile/index.html?ReferenceNo=TVP/2122/22'
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml; q=0.9,image/webp,image/apng,*/*;q=0.8"
}

# opening the page and getting elements from the table
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options, executable_path=ChromeDriverManager().install())
driver.get(URL)
time.sleep(4)  # sleep (4 sec) so the table on the site loads fully

# getting the name of each row and its content
data = {}
index_elem = 1
while True:
    # walk through the rows until we reach a non-existent one
    try:
        columns_name = driver.find_element(
            By.XPATH, f'//*[@id="projectProfile"]/table/tbody/tr[{index_elem}]/th').text
        columns_content = driver.find_element(
            By.XPATH, f'//*[@id="projectProfile"]/table/tbody/tr[{index_elem}]/td').text
        data[columns_name] = [columns_content]
        index_elem += 1
    except NoSuchElementException:
        break

df = pd.DataFrame(data)
print(df)
Output:
Project Reference ... Technological Solution(s)
0 TVP/2122/22 ... Point-of-Sales (POS) System

Insert a value in the search bar, select the autocomplete result, and get the value with bs4

I am trying to use Beautiful Soup to read a value from a web page. The following steps are necessary:
1. go to the webpage: url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
2. insert the ISIN in the search bar
3. select the autocomplete results from the container msci-ac-search-data-dropdown (click)
4. read the value from the div class "ratingdata-outercircle esgratings-profile-header-green" to get the text "ratingdata-fund-rating esg-fund-ratings-circle-aaa".
So far I have tried the following:
import requests
from bs4 import BeautifulSoup

isin = 'IE00B4L5Y983'
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'

soup = BeautifulSoup(requests.get(url).content, 'html.parser')

payload = {}
for i in soup.select('form[action="https://www.msci.com/search"] input[value]'):
    payload[i['name']] = i['value']

payload['UQ_txt'] = isin
Try:
import requests
from bs4 import BeautifulSoup

isin = "IE00B4L5Y983"
url = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "X-Requested-With": "XMLHttpRequest",
}

params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "searchFundRatingsProfiles",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_keywords": isin,
}

data = requests.get(url, params=params, headers=headers).json()

params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "showEsgFundRatingsProfile",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_fundShareClassId": data[0]["url"],
}
headers["Referer"] = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/{}/{}".format(
    data[0]["encodedTitle"], data[0]["url"]
)

soup = BeautifulSoup(
    requests.get(url, params=params, headers=headers).content, "html.parser"
)

data = soup.select_one(".ratingdata-fund-rating")["class"]
print(data)
Prints:
['ratingdata-fund-rating', 'esg-fund-ratings-circle-aaa']
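If you only need the rating itself, a small follow-up of my own (assuming the rating is always the suffix of the esg-fund-ratings-circle-* class, as in the output above) could be:
classes = ['ratingdata-fund-rating', 'esg-fund-ratings-circle-aaa']  # the list printed above

# Take the suffix of the "esg-fund-ratings-circle-*" class as the rating.
rating = next(c for c in classes if c.startswith('esg-fund-ratings-circle-')).rsplit('-', 1)[-1]
print(rating.upper())  # AAA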
When you press Enter, you send another request, whose response already contains the search result. Here is an example of how to get what you want:
import requests

isin = 'IE00B4L5Y983'
url = f"https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings?p_p_id=esg_fund_ratings_profile&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchFundRatingsProfiles&p_p_cacheability=cacheLevelPage&_esg_fund_ratings_profile_keywords={isin}"

for title in requests.get(url).json():
    print(title['title'])
OUTPUT:
iShares Core MSCI World UCITS ETF USD (Acc)
If I may: from the OP's description I can only infer that this is either an education-related test or a job-interview test. As such, following the exact instructions is paramount, and in order to follow them you can only use Selenium. The following code will work 'a la point' and get the desired result:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
browser.get(url)
WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, '_esg_fund_ratings_profile_keywords'))).send_keys('IE00B4L5Y983')
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.ID, 'ui-id-1')))
result = browser.find_element(By.ID, "ui-id-1")
result.click()
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.CLASS_NAME, 'esgratings-profile-header-green')))
result = browser.find_element(By.CLASS_NAME, "esgratings-profile-header-green").find_element(By.TAG_NAME, "div").get_attribute('class')
print(result)
browser.quit()
This will return:
ratingdata-fund-rating esg-fund-ratings-circle-aaa

Writing to CSV and only getting one column

I'm trying to write to a CSV file and am only getting one column, with the company names. Does anyone know how I can write to a CSV file and get all the data, with column headings?
Printed Output
'Agilent Technologies\n6,319\n2,912\n441\n1,619\n321\n189\n189\n1,347\n81
\n236\n1,210\n19.2%\n307', 'Alcoa\n12,152\n9,153\n31\n227\n664\n390\n390\n2,039\n195\n19\n429\n3.5%\n190',
Current Output after writing to CSV
Agilent Technologies
Alcoa
Desired Output after writing to CSV
Full Code
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import pandas as pd
import requests
import csv

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
ser = Service("./chromedriver.exe")
browser = driver = webdriver.Chrome(service=ser)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
    """
})
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
wait = WebDriverWait(driver, 30)

driver.get("https://stockrover.com")
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/section[2]/div/ul/li[2]"))).click()
user = driver.find_element(By.NAME, "username")
password = driver.find_element(By.NAME, "password")
user.clear()
user.send_keys("******")
password.clear()
password.send_keys("*******")
driver.find_element(By.NAME, "Sign In").click()
wait = WebDriverWait(driver, 30)

stocks_list = []
try:
    while True:
        # Print the stock symbols
        stocks_list.extend([my_elem.text for my_elem in WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "table[id^='gridview-1072-record']")))])
        # Click on next page button
        wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="button-1157"]'))).click()
except:
    print("Next button disabled")

print(stocks_list)  # Prints entire list of stocks

df = pd.DataFrame(stocks_list)
df.to_csv('table.csv')
You can just split each string and write the rows with the csv module; at the end, add:
stocks_lists = [x.split('\n') for x in stocks_list]
for row in stocks_lists:
    with open('output.csv', 'a', encoding='utf-8', newline='') as csv_file:
        csv_write = csv.writer(csv_file)
        csv_write.writerow(row)
Full Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import pandas as pd
import requests
import csv

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome('G://chromedriver.exe')
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
    """
})
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
wait = WebDriverWait(driver, 30)

driver.get("https://stockrover.com")
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/section[2]/div/ul/li[2]"))).click()
user = driver.find_element(By.NAME, "username")
password = driver.find_element(By.NAME, "password")
user.clear()
user.send_keys("********")
password.clear()
password.send_keys("********")
driver.find_element(By.NAME, "Sign In").click()
wait = WebDriverWait(driver, 30)

stocks_list = []
try:
    while True:
        # Print the stock symbols
        stocks_list.extend([my_elem.text for my_elem in WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "table[id^='gridview-1072-record']")))])
        # Click on next page button
        wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="button-1157"]'))).click()
except:
    print("Next button disabled")

# print(stocks_list)  # Prints entire list of stocks

stocks_lists = [x.split('\n') for x in stocks_list]
for row in stocks_lists:
    with open('output.csv', 'a', encoding='utf-8', newline='') as csv_file:
        csv_write = csv.writer(csv_file)
        csv_write.writerow(row)
Assuming that this is your list of strings:
string_sock = ["""Agilent Technologies\n6,319\n2,912\n441\n1,619\n321\n189\n189\n1,347\n81\n236\n1,210\n19.2%\n307""", """Alcoa\n12,152\n9,153\n31\n227\n664\n390\n390\n2,039\n195\n19\n429\n3.5%\n190"""]
and these are your columns:
columns = ['Company', 'Sales', 'Cost of Sales', 'R&D', 'SG&A', 'Depr', 'Capex', 'Maint Capex', 'Op income', 'Int Exp.', 'Dividends', 'Net Inc.', 'Net Margin', 'Diluted Shrs']
split each string on the separator and write the result to CSV:
splitted_rows = [v.split("\n") for v in string_sock]
pd.DataFrame(splitted_rows, columns=columns).to_csv('test.csv')

Python / Selenium / Beautiful Soup not scraping desired elements

I'm struggling to get this code to extract the desired information from a single page.
I've tried all the usual Selenium tactics and added a time delay. Hopefully it's something simple. I'm not getting any error messages.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
from time import sleep

options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,600")
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36"
options.add_argument(f'user-agent={user_agent}')
capabilities = {'chromeOptions': {'useAutomationExtension': False}, 'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path=r'/usr/local/bin/chromedriver', desired_capabilities=capabilities, options=options)

url = 'https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
browser.get(url)
sleep(3)

source_data = browser.page_source
bs_data = bs(source_data, "html.parser")

# product id
try:
    product_id = bs_data.findfindAll('span', {'class': 'pdp-main-details__product-code'})
    product_id = product_id.replace('Product code:', '').strip()
except:
    product_id = "n/a"

# image address
try:
    for image in bs_data.find("div", {"class": "s7staticimage"}):
        image_url = image.find('img')['src']
except:
    image_url = "n/a"

# product description
try:
    product_desc = bs_data.find('class', {'pdp-main-pdp-main-details__title'})
    product_desc = product_desc.get_text().strip()
except:
    product_desc = "n/a"

# product price
try:
    product_price = bs_data.find('class', {'co-product__price pdp-main-details__price'})
    product_price = product_price.get_text().strip()
except:
    product_price = "n/a"

print(url, '|', image_url, '|', product_id, '|', product_desc, '|', product_price)
browser.quit()
Any assistance is greatly appreciated.
Thanks
Since the content is dynamically generated, your soup has nothing in it to find; Selenium on its own is good enough here. I don't know why you have treated the elements as lists, because there is only one of each on this page.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--headless')
capabilities = { 'chromeOptions': { 'useAutomationExtension': False},'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path='C:/bin/chromedriver.exe',desired_capabilities = capabilities,options=options)
url='https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
browser.get(url)
browser.implicitly_wait(15)
product_id = browser.find_element_by_class_name('pdp-main-details__product-code')
print(product_id.text)
image = browser.find_element_by_xpath("//*[@id=\"s7viewer_flyout\"]/div[1]/img[1]")
image_url = image.get_attribute('src')
print(image_url)
Output:-
Product code: 410212
https://ui.assets-asda.com/dm/asdagroceries/5050854288142_T1?defaultImage=asdagroceries/noImage&resMode=sharp2&id=PqaST3&fmt=jpg&fit=constrain,1&wid=188&hei=188

I want to get the first 10 image URLs from a Google search using Selenium (Python)

I want to get the first 10 image URLs from a Google search (not base64).
I have this code:
import os
import base64
import time
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

searchterm = 'bananas'  # will also be the name of the folder
url = "https://www.google.com/search?q=banan&source=lnms&tbm=isch&sa=X&ved=2ahUKEwj-75rDlJLoAhWLHHcKHStFC6EQ_AUoAXoECA4QAw&biw=1867&bih=951"
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
browser = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=options)
browser.get(url)
actions = webdriver.common.action_chains.ActionChains(browser)

header = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
counter = 0
succounter = 0

if not os.path.exists(searchterm):
    os.mkdir(searchterm)

for i in range(0, 11):
    time.sleep(1)
    x = browser.find_elements_by_xpath('//*[@id="islrg"]/descendant::img')[i]
    x.click()
    i += 1
    if i > 10:
        break
    ba = browser.find_element_by_xpath('//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[2]/a/img')
    print(ba.get_attribute('src'))
It returns image URLs, but sometimes base64 data instead. How can I make the script always return the image URL?
Thank you.
Change the xpath to get the link rather than the image, and then get the href:
ba = browser.find_element_by_xpath("//div[@class='islrc']//a[@href][@rel='noopener']")
print(ba.get_attribute("href"))
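Alternatively (a sketch of my own, assuming the preview image keeps the XPath used in the question and that the By-style locator API is available), you can keep clicking the thumbnail but wait until the preview src switches from the inline data: URI to a real http URL:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# XPath of the preview image, taken from the question (Google may change this markup at any time).
PREVIEW_XPATH = '//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[2]/a/img'

def full_size_url(browser, timeout=10):
    """Wait until the preview <img> src is a real URL instead of a base64 data: URI, then return it."""
    WebDriverWait(browser, timeout).until(
        lambda d: d.find_element(By.XPATH, PREVIEW_XPATH).get_attribute('src').startswith('http'))
    return browser.find_element(By.XPATH, PREVIEW_XPATH).get_attribute('src')
Calling full_size_url(browser) right after x.click() in the loop should then yield only http(s) URLs.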
You can always get only image URLs if you scrape another search engine, DuckDuckGo, using the following code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

search_query = 'what you want to find'
num_images = 10
driver_location = '/put/location/of/your/driver/here'

# setting up the driver
ser = Service(driver_location)
op = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=ser, options=op)

# searching the query
driver.get(f'https://duckduckgo.com/?q={search_query}&kl=us-en&ia=web')

# going to Images Section
ba = driver.find_element(By.XPATH, "//a[@class='zcm__link js-zci-link js-zci-link--images']")
ba.click()

# getting the images URLs
for result in driver.find_elements(By.CSS_SELECTOR, '.js-images-link')[0:0+num_images]:
    imageURL = result.get_attribute('data-id')
    print(f'{imageURL}\n')

driver.quit()
