I'm trying to write to a CSV file and am only getting 1 column, with the company names. Does anyone know how I can write to a CSV file and get all the data with column headings?
Printed Output
'Agilent Technologies\n6,319\n2,912\n441\n1,619\n321\n189\n189\n1,347\n81
\n236\n1,210\n19.2%\n307', 'Alcoa\n12,152\n9,153\n31\n227\n664\n390\n390\n2,039\n195\n19\n429\n3.5%\n190',
Current Output after writing to CSV
Agilent Technologies
Alcoa
Desired Output after writing to CSV
Full Code
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import pandas as pd
import requests
import csv
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
ser = Service("./chromedriver.exe")
browser = driver = webdriver.Chrome(service=ser)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
wait = WebDriverWait(driver, 30)
driver.get("https://stockrover.com")
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/section[2]/div/ul/li[2]"))).click()
user = driver.find_element(By.NAME, "username")
password = driver.find_element(By.NAME, "password")
user.clear()
user.send_keys("******")
password.clear()
password.send_keys("*******")
driver.find_element(By.NAME, "Sign In").click()
wait = WebDriverWait(driver, 30)
stocks_list = []
try:
while True:
# Print the stock symbols
stocks_list.extend([my_elem.text for my_elem in WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "table[id^='gridview-1072-record']")))])
# Click on next page button
wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="button-1157"]'))).click()
except:
print("Next button disabled")
print(stocks_list) # Prints entire list of stocks
df=pd.DataFrame(stocks_list)
df.to_csv('table.csv')
You can just use the csv module at the end; add:
stocks_lists = [x.split('\n') for x in stocks_list]
for row in stocks_lists:
with open('output.csv', 'a', encoding='utf-8', newline='') as csv_file:
csv_write = csv.writer(csv_file)
csv_write.writerow(row)
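A small refinement, since the goal is column headings in the CSV: open the file once in write mode, write a header row first, then write every company row. This is only a sketch; the header names below are my guess based on the sample data and the pandas answer further down, so adjust them to match your actual table.
import csv

# assumed header; adjust the names/count to match the fields in each row
header = ['Company', 'Sales', 'Cost of Sales', 'R&D', 'SG&A', 'Depr', 'Capex',
          'Maint Capex', 'Op income', 'Int Exp.', 'Dividends', 'Net Inc.',
          'Net Margin', 'Diluted Shrs']

with open('output.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv_write = csv.writer(csv_file)
    csv_write.writerow(header)                        # column headings
    for row in (x.split('\n') for x in stocks_list):  # one company per row
        csv_write.writerow(row)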
Full Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import pandas as pd
import requests
import csv
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome('G://chromedriver.exe')
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
wait = WebDriverWait(driver, 30)
driver.get("https://stockrover.com")
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/section[2]/div/ul/li[2]"))).click()
user = driver.find_element(By.NAME, "username")
password = driver.find_element(By.NAME, "password")
user.clear()
user.send_keys("********")
password.clear()
password.send_keys("********")
driver.find_element(By.NAME, "Sign In").click()
wait = WebDriverWait(driver, 30)
stocks_list = []
try:
while True:
# Print the stock symbols
stocks_list.extend([my_elem.text for my_elem in WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "table[id^='gridview-1072-record']")))])
# Click on next page button
wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="button-1157"]'))).click()
except:
print("Next button disabled")
# print(stocks_list) # Prints entire list of stocks
stocks_lists = [x.split('\n') for x in stocks_list]
for row in stocks_lists:
with open('output.csv', 'a', encoding='utf-8', newline='') as csv_file:
csv_write = csv.writer(csv_file)
csv_write.writerow(row)
Assuming that this is your list of strings
string_sock = ["""Agilent Technologies\n6,319\n2,912\n441\n1,619\n321\n189\n189\n1,347\n81
\n236\n1,210\n19.2%\n307""", """Alcoa\n12,152\n9,153\n31\n227\n664\n390\n390\n2,039\n195\n19\n429\n3.5%\n190"""]
Your columns:
columns = ['Company', 'Sales', 'Cost of Sales', 'R&D', 'SG&A', 'Depr', 'Capex', 'Maint Capex', 'Op income', 'Int Exp.', 'Dividends', 'Net Inc.', 'Net Margin', 'Diluted Sh', 'rs']
Split the input with the separator
splitted_rows = [v.split("\n") for v in string_sock]
pd.DataFrame(splitted_rows, columns=columns).to_csv('test.csv')
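To wire this back into the scraper from the question, the same two lines can be applied directly to stocks_list (a sketch, assuming every scraped entry splits into exactly len(columns) fields; otherwise pandas will complain about the shape):
splitted_rows = [v.split('\n') for v in stocks_list]
pd.DataFrame(splitted_rows, columns=columns).to_csv('table.csv', index=False)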
Related
I am trying to use Beautiful Soup to read a value from a web page. The following steps are necessary:
1. Go to the webpage:
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
2. Insert the ISIN in the search bar.
3. Select the autocomplete result from the container msci-ac-search-data-dropdown (click).
4. Read the value from the div with class "ratingdata-outercircle esgratings-profile-header-green" to get the text "ratingdata-fund-rating esg-fund-ratings-circle-aaa".
So far I have tried the following:
import requests
from bs4 import BeautifulSoup
isin = 'IE00B4L5Y983'
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
soup = BeautifulSoup( requests.get(url).content, 'html.parser' )
payload = {}
for i in soup.select('form[action="https://www.msci.com/search"] input[value]'):
payload[i['name']] = i['value']
payload['UQ_txt'] = isin
Try:
import requests
from bs4 import BeautifulSoup
isin = "IE00B4L5Y983"
url = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings"
headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
"X-Requested-With": "XMLHttpRequest",
}
params = {
"p_p_id": "esg_fund_ratings_profile",
"p_p_lifecycle": "2",
"p_p_state": "normal",
"p_p_mode": "view",
"p_p_resource_id": "searchFundRatingsProfiles",
"p_p_cacheability": "cacheLevelPage",
"_esg_fund_ratings_profile_keywords": isin,
}
data = requests.get(url, params=params, headers=headers).json()
params = {
"p_p_id": "esg_fund_ratings_profile",
"p_p_lifecycle": "2",
"p_p_state": "normal",
"p_p_mode": "view",
"p_p_resource_id": "showEsgFundRatingsProfile",
"p_p_cacheability": "cacheLevelPage",
"_esg_fund_ratings_profile_fundShareClassId": data[0]["url"],
}
headers["Referer"] = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/{}/{}".format(
data[0]["encodedTitle"], data[0]["url"]
)
soup = BeautifulSoup(
requests.get(url, params=params, headers=headers).content, "html.parser"
)
data = soup.select_one(".ratingdata-fund-rating")["class"]
print(data)
Prints:
['ratingdata-fund-rating', 'esg-fund-ratings-circle-aaa']
When you press Enter, you send another request, which already returns the search result. Here is an example of how to get what you want:
import requests
isin = 'IE00B4L5Y983'
url = f"https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings?p_p_id=esg_fund_ratings_profile&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchFundRatingsProfiles&p_p_cacheability=cacheLevelPage&_esg_fund_ratings_profile_keywords={isin}"
for title in requests.get(url).json():
print(title['title'])
OUTPUT:
iShares Core MSCI World UCITS ETF USD (Acc)
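The same request can also be written with requests' params argument instead of a hand-built query string, which keeps the URL readable (same parameters as above, just restructured):
import requests

isin = 'IE00B4L5Y983'
url = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings"
params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "searchFundRatingsProfiles",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_keywords": isin,
}
for title in requests.get(url, params=params).json():
    print(title['title'])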
If I may: from the OP's description I can only infer that this is either an education-related test or a job-interview-related test. As such, following the exact instructions is paramount, and in order to follow said instructions you can only use Selenium. The following code will follow them to the letter and get the desired result:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
browser.get(url)
WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, '_esg_fund_ratings_profile_keywords'))).send_keys('IE00B4L5Y983')
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.ID, 'ui-id-1')))
result = browser.find_element(By.ID, "ui-id-1")
result.click()
WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.CLASS_NAME, 'esgratings-profile-header-green')))
result = browser.find_element(By.CLASS_NAME, "esgratings-profile-header-green").find_element(By.TAG_NAME, "div").get_attribute('class')
print(result)
browser.quit()
This will return:
ratingdata-fund-rating esg-fund-ratings-circle-aaa
My spider scrapes the data from page 1 only and does not move on to the second page. I have tried different approaches but have not been able to solve this problem. If there is any solution, please provide it. This is the page URL: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import scrapy
from scrapy.http import Request
from selenium import webdriver
class TestSpider(scrapy.Spider):
name = 'test'
start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
custom_settings = {
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
'DOWNLOAD_DELAY': 1,
'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
def parse(self, response):
books = response.xpath("//div[@class='list-group']//@href").extract()
for book in books:
url = response.urljoin(book)
if url.endswith('.ro') or url.endswith('.ro/'):
continue
yield Request(url, callback=self.parse_book)
def __init__(self):
self.driver = webdriver.Chrome(
'C:\Program Files (x86)\chromedriver.exe')
def parse_book(self, response):
self.driver.get(
"https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx")
title = response.xpath(
"//span[@id='HeadingContent_lblTitle']//text()").get()
d1 = response.xpath("//div[@class='col-md-10']//p[1]//text()").get()
d1 = d1.strip()
d2 = response.xpath("//div[@class='col-md-10']//p[2]//text()").get()
d2 = d2.strip()
d3 = response.xpath(
"//div[@class='col-md-10']//p[3]//span//text()").get()
d3 = d3.strip()
d4 = response.xpath("//div[@class='col-md-10']//p[4]//text()").get()
d4 = d4.strip()
yield{
"title1": title,
"title2": d1,
"title3": d2,
"title4": d3,
"title5": d4,
}
max_page_el = WebDriverWait(driver, 10).until(
EC.visibility_of_element_located((By.ID, "MainContent_PagerTop_lblPages"))
)
max_page = int(max_page_el.text.split("din").pop().split(")")[0])
# test with smaller number
for i in range(1, 4):
WebDriverWait(driver, 10).until(
EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{i}"))
).click()
# scrap here
elements = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located((By.CLASS_NAME, f"list-group"))
)
# just an example
print(elements)
driver.quit()
This gives me the output of only 1 page.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Setup your driver, open the page, etc.
max_page_el = WebDriverWait(driver, 10).until(
EC.visibility_of_element_located((By.ID, "MainContent_PagerTop_lblPages"))
)
# you can use regex here
max_page = int(max_page_el.text.split("din").pop().split(")")[0])
# test with smaller number
for i in range(1, max_page):
WebDriverWait(driver, 10).until(
EC.visibility_of_element_located((By.ID, f"MainContent_PagerTop_NavToPage{i}"))
).click()
# scrap here
elements = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located((By.CLASS_NAME, f"list-group"))
)
# just an example
print(elements)
driver.quit()
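For the "# you can use regex here" comment, a possible alternative for parsing the page count (an untested sketch, assuming the label text contains the total after the Romanian word "din"):
import re

match = re.search(r"din\s*(\d+)", max_page_el.text)
max_page = int(match.group(1)) if match else 1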
I'm trying to retrieve stock information from 2 different URLs and write the information to a pandas DataFrame. However, I keep getting errors. Could anyone please help me out here?
I'm pretty new to Python, so my code will probably look very ugly :D
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers= {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'max-age=0'
}
PATH='C:\Program Files (x86)\chromedriver.exe'
options = Options()
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument("--window-size=2550,1440")
s = Service('C:\Program Files (x86)\chromedriver.exe')
driver = webdriver.Chrome(PATH, options=options)
driver.implicitly_wait(10)
# create a dataframe
dn=[]
def accept_cookies():
try:
driver.find_element(By.ID, 'accept-choices').click()
except:
print('fu')
stocklist=["FB","KLIC"]
for x in stocklist:
url = f"https://stockanalysis.com/stocks/{x}/financials/"
driver.get(url)
driver.implicitly_wait(10)
accept_cookies()
driver.implicitly_wait(10)
driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
xlwriter = pd.ExcelWriter(f'financial statements1.xlsx', engine='xlsxwriter')
soup = BeautifulSoup(driver.page_source, 'html.parser')
df = pd.read_html(str(soup), attrs={'id': 'financial-table'})[0]
new_df = pd.concat(df)
dn.to_excel(xlwriter, sheet_name='key', index=False)
xlwriter.save()
pd.concat needs a list of objects to concatenate, whereas you have only given it df.
So I think you should replace pd.concat(df) with pd.concat([df, new_df]) and define new_df = pd.DataFrame() before the for loop.
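Applied to the loop from the question, that suggestion would look roughly like this (a sketch; only the concatenation and the final to_excel change, and the writer is created once after the loop):
new_df = pd.DataFrame()  # empty frame to accumulate into, defined before the loop
for x in stocklist:
    url = f"https://stockanalysis.com/stocks/{x}/financials/"
    driver.get(url)
    accept_cookies()
    driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    df = pd.read_html(str(soup), attrs={'id': 'financial-table'})[0]
    new_df = pd.concat([df, new_df])  # accumulate each ticker's table

xlwriter = pd.ExcelWriter('financial statements1.xlsx', engine='xlsxwriter')
new_df.to_excel(xlwriter, sheet_name='key', index=False)
xlwriter.save()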
Provided there is no issue with the .read_html() part, you should push each df into a list of data frames:
dflist =[]
for x in stocklist:
url = f"https://stockanalysis.com/stocks/{x}/financials/"
driver.get(url)
driver.implicitly_wait(10)
accept_cookies()
driver.implicitly_wait(10)
driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
soup = BeautifulSoup(driver.page_source, 'html.parser')
dflist.append(pd.read_html(str(soup), attrs={'id': 'financial-table'})[0])
After finishing the iteration, you can simply concat the list of data frames into a single one:
xlwriter = pd.ExcelWriter(f'financial statements1.xlsx', engine='xlsxwriter')
pd.concat(dflist).to_excel(xlwriter, sheet_name='key', index=False)
xlwriter.save()
Full example:
dflist =[]
for x in stocklist:
url = f"https://stockanalysis.com/stocks/{x}/financials/"
driver.get(url)
driver.implicitly_wait(10)
accept_cookies()
driver.implicitly_wait(10)
driver.find_element(By.XPATH, "//span[text()='Quarterly']").click()
soup = BeautifulSoup(driver.page_source, 'html.parser')
dflist.append(pd.read_html(str(soup), attrs={'id': 'financial-table'})[0])
xlwriter = pd.ExcelWriter(f'financial statements1.xlsx', engine='xlsxwriter')
pd.concat(dflist).to_excel(xlwriter, sheet_name='key', index=False)
xlwriter.save()
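Note that on newer pandas versions ExcelWriter.save() is deprecated; using the writer as a context manager sidesteps that and saves the file automatically:
with pd.ExcelWriter('financial statements1.xlsx', engine='xlsxwriter') as xlwriter:
    pd.concat(dflist).to_excel(xlwriter, sheet_name='key', index=False)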
I'm struggling to get this code to extract the desired information from one single page.
I've tried all the usual selenium tactics and added a time delay. Hopefully, it's something simple. I'm not getting any error messages.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
from time import sleep
options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,600")
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36"
options.add_argument(f'user-agent={user_agent}')
capabilities = { 'chromeOptions': { 'useAutomationExtension': False},'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path=r'/usr/local/bin/chromedriver',desired_capabilities = capabilities,options=options)
url='https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
browser.get(url)
sleep(3)
source_data = browser.page_source
bs_data = bs(source_data,"html.parser")
#product id
try:
product_id = bs_data.findfindAll('span', {'class': 'pdp-main-details__product-code'})
product_id = product_id.replace('Product code:','').strip()
except:
product_id = "n/a"
#image address
try:
for image in bs_data.find("div", {"class":"s7staticimage"}):
image_url = image.find('img')['src']
except:
image_url = "n/a"
#product description
try:
product_desc = bs_data.find('class',{'pdp-main-pdp-main-details__title'})
product_desc = product_desc.get_text().strip()
except:
product_desc = "n/a"
#product price
try:
product_price = bs_data.find('class',{'co-product__price pdp-main-details__price'})
product_price = product_price.get_text().strip()
except:
product_price = "n/a"
print (url,'|',image_url,'|',product_id,'|',product_desc,'|',product_price)
browser.quit()
Any assistance is greatly appreciated.
Thanks
Since the content is dynamically generated, your soup has nothing in it to find. Selenium alone is good enough. I don't know why you have treated the elements as a list, because there is only one of each on this page.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--headless')
capabilities = { 'chromeOptions': { 'useAutomationExtension': False},'args': ['--disable-extensions']}
browser = webdriver.Chrome(executable_path='C:/bin/chromedriver.exe',desired_capabilities = capabilities,options=options)
url='https://groceries.asda.com/product/celery-spring-onions/asda-growers-selection-trimmed-spring-onions/41676'
browser.get(url)
browser.implicitly_wait(15)
product_id = browser.find_element_by_class_name('pdp-main-details__product-code')
print(product_id.text)
image = browser.find_element_by_xpath("//*[@id=\"s7viewer_flyout\"]/div[1]/img[1]")
image_url = image.get_attribute('src')
print(image_url)
Output:
Product code: 410212
https://ui.assets-asda.com/dm/asdagroceries/5050854288142_T1?defaultImage=asdagroceries/noImage&resMode=sharp2&id=PqaST3&fmt=jpg&fit=constrain,1&wid=188&hei=188
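If you only want the numeric code, as in the replace() call from your original attempt, the same stripping works on the element text (a small sketch based on the output above):
product_code = product_id.text.replace('Product code:', '').strip()
print(product_code)  # 410212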
I want to get first 10 images url from google search (not base64).
I have code:
import os
import base64
import time
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
searchterm = 'bananas' # will also be the name of the folder
url = "https://www.google.com/search?q=banan&source=lnms&tbm=isch&sa=X&ved=2ahUKEwj-75rDlJLoAhWLHHcKHStFC6EQ_AUoAXoECA4QAw&biw=1867&bih=951"
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
browser = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=options)
browser.get(url)
actions = webdriver.common.action_chains.ActionChains(browser)
header = {
'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
counter = 0
succounter = 0
if not os.path.exists(searchterm):
os.mkdir(searchterm)
for i in range(0, 11):
time.sleep(1)
x = browser.find_elements_by_xpath('//*[@id="islrg"]/descendant::img')[i]
x.click()
i += 1
if i > 10:
break
ba = browser.find_element_by_xpath('//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[2]/a/img')
print(ba.get_attribute('src'))
It returns image URLs, but sometimes base64 data. How can I make the script always return an image URL?
Thank you.
Change the XPath to get the link rather than the image, and then get the href.
ba = browser.find_element_by_xpath("//div[@class='islrc']//a[@href][@rel='noopener']")
print(ba.get_attribute("href"))
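Since the goal is the first 10 URLs, the same locator can be used with find_elements_by_xpath and a slice (a sketch reusing the selector above; these are the hrefs of the result links rather than the raw image sources):
links = browser.find_elements_by_xpath("//div[@class='islrc']//a[@href][@rel='noopener']")
for link in links[:10]:
    print(link.get_attribute("href"))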
You can always get only image URLs if you scrape another search engine, DuckDuckGo, using the following code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

search_query = 'what you want to find'
num_images = 10
driver_location = '/put/location/of/your/driver/here'
# setting up the driver
ser = Service(driver_location)
op = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=ser, options=op)
# searching the query
driver.get(f'https://duckduckgo.com/?q={search_query}&kl=us-en&ia=web')
# going to Images Section
ba = driver.find_element(By.XPATH, "//a[#class='zcm__link js-zci-link js-zci-link--images']")
ba.click()
# getting the images URLs
for result in driver.find_elements(By.CSS_SELECTOR, '.js-images-link')[0:0+num_images]:
imageURL = result.get_attribute('data-id')
print(f'{imageURL}\n')
driver.quit()