I want to create a Python program that automatically reads keywords from an Excel sheet and types them into the Google search bar. It should then find the longest and shortest suggestions and store them in the same Excel sheet. I am facing some issues when I run my program.
Here is my code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from datetime import datetime
import openpyxl
from selenium.webdriver.common.by import By
import os
# open existing Excel sheet
workbook = openpyxl.load_workbook('Excel/Excel.xlsx')
sheet = workbook.active
# get the number of rows in the sheet
row_count = sheet.max_row
# create webdriver instance
driver = webdriver.Chrome()
# loop through the rows of the Excel sheet
for i in range(1, row_count + 1):
    # read keyword from Excel sheet
    keyword = sheet.cell(i, 3).value
    # navigate to Google Search page
    driver.get("https://www.google.com")
    # find search box element and enter search query
    search_box = driver.find_element(By.NAME, "q")
    search_box.send_keys(keyword)
    search_box.send_keys(Keys.RETURN)
    # extract all suggested web pages
    suggestions = driver.find_elements(By.CLASS_NAME, "wM6W7d")
    suggestions_text = [sugg.text for sugg in suggestions]
    # find the longest and shortest suggestion
    longest_suggestion = max(suggestions_text, key=len)
    shortest_suggestion = min(suggestions_text, key=len)
    # get current date and time
    now = datetime.now()
    date_time = now.strftime("%Y-%m-%d %H:%M:%S")
    # write date and time to specific cell in the Excel sheet
    sheet.cell(i, 3).value = date_time
    sheet.cell(i, 2).value = keyword
    sheet.cell(i, 4).value = longest_suggestion
    sheet.cell(i, 5).value = shortest_suggestion
workbook.save('Excel/Excel.xlsx')
# close webdriver
driver.close()
Error:
C:\Users\DELL\PycharmProjects\SeleniumTest\venv\Scripts\python.exe C:/Users/DELL/PycharmProjects/SeleniumTest/multiple.py
Traceback (most recent call last):
  File "C:\Users\DELL\PycharmProjects\SeleniumTest\multiple.py", line 28, in <module>
    search_box.send_keys(keyword)
  File "C:\Users\DELL\PycharmProjects\SeleniumTest\venv\Lib\site-packages\selenium\webdriver\remote\webelement.py", line 234, in send_keys
    Command.SEND_KEYS_TO_ELEMENT, {"text": "".join(keys_to_typing(value)), "value": keys_to_typing(value)}
                                                   ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DELL\PycharmProjects\SeleniumTest\venv\Lib\site-packages\selenium\webdriver\common\utils.py", line 148, in keys_to_typing
    for i in range(len(val)):
                   ^^^^^^^^
TypeError: object of type 'NoneType' has no len()
Process finished with exit code 1
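The traceback shows that send_keys received None, i.e. sheet.cell(i, 3).value was empty for some row; note also that the loop writes the timestamp back into column 3, the very column it reads the keyword from, so a second run is guaranteed to hit this. Below is a minimal sketch of a fix, assuming the keywords live in column 3 and the timestamp can move to an unused column; the "wM6W7d" class name is Google's obfuscated markup and may change at any time:

from datetime import datetime

import openpyxl
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

workbook = openpyxl.load_workbook('Excel/Excel.xlsx')
sheet = workbook.active
driver = webdriver.Chrome()

for i in range(1, sheet.max_row + 1):
    keyword = sheet.cell(i, 3).value
    if not keyword:  # skip empty cells instead of passing None to send_keys
        continue
    driver.get("https://www.google.com")
    search_box = driver.find_element(By.NAME, "q")
    # type the keyword but do NOT press RETURN: the suggestions only exist
    # while the autocomplete dropdown is open
    search_box.send_keys(str(keyword))
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "wM6W7d")))
    suggestions_text = [s.text for s in driver.find_elements(By.CLASS_NAME, "wM6W7d")]
    if suggestions_text:
        sheet.cell(i, 4).value = max(suggestions_text, key=len)
        sheet.cell(i, 5).value = min(suggestions_text, key=len)
    # column 6 for the timestamp is an assumption; it just must not be column 3
    sheet.cell(i, 6).value = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

workbook.save('Excel/Excel.xlsx')
driver.quit()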
Related
I created a script to pull data from a website and paste it into Excel. The script works; however, when I extract the Name from the website and paste it into Excel, for some reason the text appears very small. But when you click into the cell, it is normal size.
I tried to include a screenshot as an example, but Stack won't allow me.
Here is the current code:
from openpyxl import load_workbook
from bs4 import BeautifulSoup
import requests
# Fetch the HTML page
url = 'https://esearch.mobilecopropertytax.com/Property/View/466089'
response = requests.get(url)
html = response.text
# Parse the HTML page
soup = BeautifulSoup(html, 'lxml')
# Find the element containing the Parcel Number
element = soup.find('th', text='Parcel Number:')
# Extract the Parcel Number from the element
parcel_number = element.find_next_sibling().text
# Find the element containing the Name
element = soup.find('th', text='Name:')
# Extract the Name from the element
name = element.find_next_sibling().text
# Load the workbook
wb = load_workbook(r'C:\Users\user\EJW Test\EJWtest.xlsx')
# Select the sheet
ws = wb['Justification Worksheet']
# Select the cells
cell1 = ws['C7'] # Parcel Number
cell2 = ws['C10'] # Name
# Set the values of the cells
cell1.value = parcel_number
cell2.value = name
# Save the workbook
wb.save('completetest.xlsx')
I am unsure why it inserts the parcel info just fine, but the name, once inserted into Excel, looks completely different.
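One hedged guess about the "very small" text: the Name cell on that page may contain embedded newlines and padding spaces, which Excel renders oddly until you click into the cell. A minimal sketch that normalises the whitespace before writing, reusing the soup and cell objects from the code above:

# Sketch (assumption): collapse hidden newlines/padding in the scraped Name
element = soup.find('th', text='Name:')
raw_name = element.find_next_sibling().get_text()
name = ' '.join(raw_name.split())  # collapse all runs of whitespace into single spaces
cell2.value = name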
Now I am having an issue where the next page does not load completely before the script reads it, and the script raises a NoSuchElementException halfway through. I tried adding a WebDriverWait, as shown below in the code, but it is not working. Any help is greatly appreciated. Thank you!
import sys
import csv
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# default path to file to store data
path_to_file = "/Users/Dan/Desktop/reviews.csv"
# default number of scraped pages
num_page = 1939
# default tripadvisor website of hotel or things to do (attraction/monument)
url = "https://www.tripadvisor.com/Attraction_Review-g187791-d192285-Reviews-Colosseum-Rome_Lazio.html"

# if you pass the inputs in the command line
if len(sys.argv) == 4:
    path_to_file = sys.argv[1]
    num_page = int(sys.argv[2])
    url = sys.argv[3]

# create the webdriver
driver = webdriver.Safari()
driver.get(url)

# open the file to save the review
csvFile = open(path_to_file, 'a', encoding="utf-8")
csvWriter = csv.writer(csvFile)

# change the value inside the range to save more or less reviews
data = []
for i in range(0, num_page):
    # wait until the next-page link is clickable
    wait = WebDriverWait(driver, 10)
    element = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='tabs-1']/div[2]/a[@accesskey='n']")))
    container = driver.find_elements_by_xpath("//*[@id='tabs-1']/div[3]/table/tbody")
    for con in container:
        name = con.find_element_by_xpath(".//tr[2]/td").text.replace("Sponsor Name:", "")
        start = con.find_element_by_xpath(".//tr[1]/td[3]").text.replace("Start Date*: ", "")
        data.append((name, start))
    df = pd.DataFrame(data, columns=['Name', 'Start'])
    df.to_csv('/Users/Dan/Desktop/reviews.csv', index=False)
    driver.find_element_by_xpath("//*[@id='tabs-1']/div[2]/a[@accesskey='n']").click()
driver.quit()
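One hedged adjustment for the incomplete page loads (a sketch reusing the setup and XPaths above): wait for the results table body itself to be present before reading the rows, not just for the next-page link, and write the CSV once at the end instead of rewriting it on every page.

# Sketch: on each page, wait for the table body before scraping it,
# then wait for the next-page link before clicking it.
wait = WebDriverWait(driver, 10)
data = []
for i in range(0, num_page):
    wait.until(EC.presence_of_all_elements_located(
        (By.XPATH, "//*[@id='tabs-1']/div[3]/table/tbody")))
    for con in driver.find_elements_by_xpath("//*[@id='tabs-1']/div[3]/table/tbody"):
        name = con.find_element_by_xpath(".//tr[2]/td").text.replace("Sponsor Name:", "")
        start = con.find_element_by_xpath(".//tr[1]/td[3]").text.replace("Start Date*: ", "")
        data.append((name, start))
    next_link = wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//*[@id='tabs-1']/div[2]/a[@accesskey='n']")))
    next_link.click()
pd.DataFrame(data, columns=['Name', 'Start']).to_csv(path_to_file, index=False)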
I am not sure how to do it with csv.writer, but I know you can do it like this: just append the rows to a 2-D list (a list of tuples) and then write it to a CSV file.
driver.get("https://www.clinicaltrialsregister.eu/ctr-search/search?query=")
container = driver.find_elements_by_xpath("//*[#id='tabs-1']/div[3]/table/tbody")
data=[]
for con in container:
name = con.find_element_by_xpath(".//tr[2]/td").text
start = con.find_element_by_xpath(".//tr[1]/td[3]").text
data.append((name,start))
df = pd.DataFrame(data,columns=['Name','Start'])
#print(df)
df.to_csv('/Users/Dan/Desktop/reviews.csv', index = False)
Import:
import pandas as pd
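And if you prefer to stay with the csv module instead of pandas, a minimal csv.writer sketch of the same idea (an assumption, not part of the original answer):

import csv

# write the same (name, start) tuples collected above with the standard library
with open('/Users/Dan/Desktop/reviews.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['Name', 'Start'])  # header row
    csv_writer.writerows(data)              # data is the list of tuples built above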
I'm trying to save the data from a Google Scholar profile into a CSV. The profile has a 'Show More' button, and I can get all the data from it (here I only saved the data from the table, but I need all the data from the profile). The problem is that I sometimes save the data twice or even more times, and I think it's because I save it while I am still clicking instead of after I have clicked all the 'Show More' buttons. How can I fix that? Also, here I used only one URL, but there are more, and I have them saved in another CSV, so how do I open each URL from there to do what I do here? (I only need the Link column.) The CSV with the URLs looks like this:
https://drive.google.com/file/d/1zkTlzYaOQ7FVoSdd5OMnE8QgwS8NOik7/view?usp=sharing
from selenium.webdriver.support.ui import WebDriverWait as W
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common import exceptions as SE
from selenium import webdriver
import time
from csv import writer

chrome_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)

urls = ["https://scholar.google.com/citations?hl=en&user=gQb_tFMAAAAJ"]
button_locators = "//button[@class='gs_btnPD gs_in_ib gs_btn_flat gs_btn_lrge gs_btn_lsu']"
wait_time = 2
wait = W(driver, wait_time)

for url in urls:
    data = {}
    driver.get(url)
    button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
    while button_link:
        try:
            wait.until(EC.visibility_of_element_located((By.ID, 'gsc_a_tw')))
            data = driver.find_elements_by_class_name("gsc_a_tr")
            button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
            button_link.click()
            time.sleep(2)
            with open('perfil.csv', 'a', encoding="utf-8", newline='') as s:
                csv_writer = writer(s)
                for i in range(len(data)):
                    paper = driver.find_elements_by_class_name("gsc_a_t")
                    citas = driver.find_elements_by_class_name("gsc_a_c")
                    año = driver.find_elements_by_class_name("gsc_a_y")
                    p = paper[i].text.replace(',', '')
                    c = citas[i].text.replace(',', '')
                    a = año[i].text.replace(',', '')
                    csv_writer.writerow([p, c, a])
        except SE.TimeoutException:
            print(f'Página parseada {url}')
            break
driver.quit()
For the first part, I didn't really get what's happening. But for the second part, you can change the URLs from hard-coded values to a function parameter (put the loop in a function), and you can use the pandas library for the CSV (it's much better). This is for getting the URLs:
import pandas as pd
df = pd.read_csv(csv_file)
urls = df['column_name']
Here is the most basic way to read data from a CSV file:
import csv

with open('filename.csv', 'r') as file:
    reader = csv.reader(file)  # pass the file object, not the filename string
    for row in reader:
        print(row)
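Putting the pieces together for the original question, a hedged sketch: read the URLs from the CSV (the 'Link' column, per the question), click 'Show More' until it stops appearing, and only then scrape and write, so nothing is saved twice. The filename 'urls.csv' is a placeholder; everything else reuses the asker's setup and selectors from above.

import pandas as pd

urls = pd.read_csv('urls.csv')['Link']  # 'Link' column name taken from the question
for url in urls:
    driver.get(url)
    # first exhaust every 'Show More' click...
    while True:
        try:
            wait.until(EC.element_to_be_clickable((By.XPATH, button_locators))).click()
            time.sleep(2)
        except SE.TimeoutException:
            break  # no 'Show More' button left: the table is fully expanded
    # ...then scrape the fully expanded table exactly once
    rows = driver.find_elements_by_class_name("gsc_a_tr")
    with open('perfil.csv', 'a', encoding='utf-8', newline='') as s:
        csv_writer = writer(s)
        for row in rows:
            p = row.find_element_by_class_name("gsc_a_t").text.replace(',', '')
            c = row.find_element_by_class_name("gsc_a_c").text.replace(',', '')
            a = row.find_element_by_class_name("gsc_a_y").text.replace(',', '')
            csv_writer.writerow([p, c, a])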
I have a number of HTML files that I need to open up or import into a single Excel workbook and simply save the workbook. Each HTML file should be on its own worksheet inside the workbook.
My existing code does not work: it crashes on the workbook.Open(html) line, and would probably crash on the following lines as well. I can't find anything on the web specific to this topic.
import win32com.client as win32
import pathlib as path

def save_html_files_to_worksheets(read_directory):
    read_path = path.Path(read_directory)
    save_path = read_path.joinpath('Single_Workbook_Containing_HTML_Files.xlsx')
    excel_app = win32.gencache.EnsureDispatch('Excel.Application')
    workbook = excel_app.Workbooks.Add()  # create a new excel workbook
    indx = 1  # used to add new worksheets dependent on number of html files
    for html in read_path.glob('*.html'):  # loop through directory getting html files
        workbook.Open(html)  # open the html in the newly created workbook - this doesn't work though
        worksheet = workbook.Worksheets(indx)  # each iteration in loop add new worksheet
        worksheet.Name = 'Test' + str(indx)  # name added worksheets
        indx += 1
    workbook.SaveAs(str(save_path), 51)  # win32com requires a string-like path, 51 is the xlsx format
    excel_app.Application.Quit()

save_html_files_to_worksheets(r'C:\Users\<UserName>\Desktop\HTML_FOLDER')
The following code does half of what I want, if this helps. It converts each HTML file into a separate Excel file; I need each HTML file in one Excel file with multiple worksheets.
import win32com.client as win32
import pathlib as path

def save_as_xlsx(read_directory):
    read_path = path.Path(read_directory)
    excel_app = win32.gencache.EnsureDispatch('Excel.Application')
    for html in read_path.glob('*.html'):
        save_path = read_path.joinpath(html.stem + '.xlsx')
        wb = excel_app.Workbooks.Open(html)
        wb.SaveAs(str(save_path), 51)
    excel_app.Application.Quit()

save_as_xlsx(r'C:\Users\<UserName>\Desktop\HTML_FOLDER')
Here is a link to a sample HTML file you can use; the data in the file is not real: HTML Download Link
One solution would be to open the HTML file into a temporary workbook, and copy the sheet from there into the workbook containing all of them:
workbook = excel_app.Application.Workbooks.Add()
sheet = workbook.Sheets(1)
for path in read_path.glob('*.html'):
    workbook_tmp = excel_app.Application.Workbooks.Open(path)
    workbook_tmp.Sheets(1).Copy(Before=sheet)
    workbook_tmp.Close()

# Remove the redundant 'Sheet1'
excel_app.Application.ShowAlerts = False
sheet.Delete()
excel_app.Application.ShowAlerts = True
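Folded back into the question's function, the whole flow might look like this (a sketch, untested; the 'Test1', 'Test2' sheet names follow the question's convention, and str() around the path is a precaution for older pywin32):

import win32com.client as win32
import pathlib as path

def save_html_files_to_worksheets(read_directory):
    read_path = path.Path(read_directory)
    save_path = read_path.joinpath('Single_Workbook_Containing_HTML_Files.xlsx')
    excel_app = win32.gencache.EnsureDispatch('Excel.Application')
    workbook = excel_app.Workbooks.Add()
    sheet = workbook.Sheets(1)  # placeholder sheet, deleted at the end
    indx = 1
    for html in read_path.glob('*.html'):
        # open each html in a throwaway workbook and copy its first sheet across
        workbook_tmp = excel_app.Workbooks.Open(str(html))
        workbook_tmp.Sheets(1).Copy(Before=sheet)
        workbook.Sheets(sheet.Index - 1).Name = 'Test' + str(indx)  # the fresh copy sits just before 'sheet'
        workbook_tmp.Close()
        indx += 1
    # drop the now-redundant placeholder sheet, suppressing the confirmation dialog
    excel_app.Application.ShowAlerts = False
    sheet.Delete()
    excel_app.Application.ShowAlerts = True
    workbook.SaveAs(str(save_path), 51)  # 51 = xlsx
    excel_app.Application.Quit()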
I believe pandas will make your job much easier.
pip install pandas
Here's an example of how to get multiple tables from a Wikipedia HTML page into pandas DataFrames and save them to disk.
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_American_films_of_2017"
wikitables = pd.read_html(url, header=0, attrs={"class": "wikitable"})
for idx, df in enumerate(wikitables):
    df.to_csv('{}.csv'.format(idx), index=False)
For your use case, something like this should work:
import pathlib as path
import pandas as pd

def save_as_xlsx(read_directory):
    read_path = path.Path(read_directory)
    for html in read_path.glob('*.html'):
        save_path = read_path.joinpath(html.stem + '.xlsx')
        dfs_from_html = pd.read_html(html, header=0)
        # write all tables found in this html file into one workbook,
        # one sheet per table (the original snippet computed save_path but never used it)
        with pd.ExcelWriter(save_path) as xlsx_writer:
            for idx, df in enumerate(dfs_from_html):
                df.to_excel(xlsx_writer, sheet_name='Table{}'.format(idx), index=False)
Note: make sure to set the correct attrs (the HTML attributes filter) in the pd.read_html call for your files.
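Since the stated goal was a single workbook with one worksheet per HTML file, here is a pandas-only sketch of that variant (assumptions: the first table in each file is the one you want, and an xlsx engine such as openpyxl is installed):

import pathlib as path
import pandas as pd

def save_html_files_to_one_workbook(read_directory):
    read_path = path.Path(read_directory)
    save_path = read_path.joinpath('Single_Workbook_Containing_HTML_Files.xlsx')
    # one ExcelWriter for the whole run, so every file lands in the same workbook
    with pd.ExcelWriter(save_path) as xlsx_writer:
        for indx, html in enumerate(read_path.glob('*.html'), start=1):
            df = pd.read_html(html, header=0)[0]  # assumption: first table per file
            df.to_excel(xlsx_writer, sheet_name='Test{}'.format(indx), index=False)

save_html_files_to_one_workbook(r'C:\Users\<UserName>\Desktop\HTML_FOLDER')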
How about this?
Sub From_XML_To_XL()
'UpdatebyKutoolsforExcel20151214
    Dim xWb As Workbook
    Dim xSWb As Workbook
    Dim xStrPath As String
    Dim xFileDialog As FileDialog
    Dim xFile As String
    Dim xCount As Long
    On Error GoTo ErrHandler
    Set xFileDialog = Application.FileDialog(msoFileDialogFolderPicker)
    xFileDialog.AllowMultiSelect = False
    xFileDialog.Title = "Select a folder [Kutools for Excel]"
    If xFileDialog.Show = -1 Then
        xStrPath = xFileDialog.SelectedItems(1)
    End If
    If xStrPath = "" Then Exit Sub
    Application.ScreenUpdating = False
    Set xSWb = ThisWorkbook
    xCount = 1
    xFile = Dir(xStrPath & "\*.xml")
    Do While xFile <> ""
        Set xWb = Workbooks.OpenXML(xStrPath & "\" & xFile)
        xWb.Sheets(1).UsedRange.Copy xSWb.Sheets(1).Cells(xCount, 1)
        xWb.Close False
        xCount = xSWb.Sheets(1).UsedRange.Rows.Count + 2
        xFile = Dir()
    Loop
    Application.ScreenUpdating = True
    xSWb.Save
    Exit Sub
ErrHandler:
    MsgBox "no files xml", , "Kutools for Excel"
End Sub
First of all, I am new to Python (practically, I have learned only from Sololearn, and only up to half the course), so I request a somewhat detailed answer.
My task has the following broad steps:
Delete the old .xlsx file (if any).
Convert two .xls files into .xlsx files using win32, delete the first row, and then delete the .xls files. (The .xls files are already downloaded into the source directory, and they are weird: xlrd and pyexcel raise "unsupported format or corrupt file" errors when opening them, and online analysis of the files suggests they are actually html/htm.)
Get the data from the .xlsx file.
Delete the old worksheet on the Google spreadsheet to remove old data, create a new worksheet with the same name, and insert the data into the new worksheet.
Open the second sheet (which imports data from the first sheet) and update one cell in the Dummy sheet to make sure the Google spreadsheet synchronises in the background.
I wrote this code by combining many snippets and a lot of googling.
The code works fine, but it takes about 65 seconds on average to complete the whole process.
My question has 3 parts:
Is there any way to directly access the data in the .xls files? (A sketch of one possibility follows the code below.)
Is there any way this code's performance can be improved?
Is there any other, more efficient method for completing the above task?
My code:
import time
import win32com.client as win32
import os
import openpyxl
from openpyxl.utils import get_column_letter
import gspread
from oauth2client.service_account import ServiceAccountCredentials

start = time.time()

# set input-output file locations
source_dir = "C:\\Users\\XYZ\\Downloads"
output_dir = "C:\\Users\\XYZ\\Excels"

# use creds to create a client to interact with the Google Drive API
# make sure to share files with email contained in json file
scope = ['https://spreadsheets.google.com/feeds']
# code will not work without json file
creds = ServiceAccountCredentials.from_json_keyfile_name("C:\\Users\\XYZ\\your.json", scope)
gc = gspread.authorize(creds)
# following code is to open any spreadsheet by name
sh = gc.open("First Sheet")

def save_as_xlsx(input_file, output_dir, output_file_name):
    # call excel using win32, then open .xls file
    # delete first row and then save as .xlsx
    excel = win32.gencache.EnsureDispatch('Excel.Application')
    wb = excel.Workbooks.Open(input_file)
    wbk = excel.ActiveWorkbook
    sheet = wbk.Sheets(1)
    sheet.Rows(1).Delete()
    wb.SaveAs(output_dir + '\\' + output_file_name, FileFormat=51)
    # FileFormat = 51 is for .xlsx extension. FileFormat = 56 is for .xls extension
    wb.Close()
    excel.Application.Quit()
    return True

def get_the_data_from_xlsx(output_dir, output_file_name):
    # use openpyxl.load to find out last cell of file
    # store cell values in list called data
    wb = openpyxl.load_workbook(output_dir + '\\' + output_file_name)
    sheet = wb.active
    max_row_no = sheet.max_row
    max_column_no = sheet.max_column
    max_column = get_column_letter(max_column_no)
    last_cell = str(max_column) + str(max_row_no)
    cell_addresses = sheet['A1':last_cell]
    data = []
    for i in cell_addresses:
        for e in i:
            data.append(e.value)
    return (data, last_cell)

def insert_data_into_spreadsheet(name_of_worksheet, data, last_cell):
    # find a worksheet by name in already opened spreadsheet
    # delete the worksheet to clear old data
    # create worksheet with same name to maintain import connections in sheets
    worksheet = sh.worksheet(name_of_worksheet)
    sh.del_worksheet(worksheet)
    worksheet = sh.add_worksheet(title=name_of_worksheet, rows="500", cols="30")
    # store range of cells for spreadsheet in list named cell_list
    cell_list = worksheet.range('A1' + ':' + str(last_cell))
    # attach all the values from data list as per the cell_list
    a = 0
    for cell in cell_list:
        cell.value = data[a]
        a = a + 1
    # update all cells stored in cell_list in one go
    worksheet.update_cells(cell_list)

def delete_file(directory, file_initials):
    for filename in os.listdir(directory):
        if filename.startswith(file_initials):
            os.unlink(directory + "\\" + filename)

# check if files are in source_dir
for filename in os.listdir(source_dir):
    # check for file1.xls and set input_file name if any file exists
    if filename.startswith("file1"):
        input_file = source_dir + "\\file1.xls"
        output_file1 = "output_file1.xlsx"
        # detect and delete any old file in output directory
        delete_file(output_dir, "output_file1")
        if save_as_xlsx(input_file, output_dir, output_file1) == True:
            # delete the file from source directory after work is done
            delete_file(source_dir, 'file1')
            # get data from new xlsx file
            data_from_xlsx = get_the_data_from_xlsx(output_dir, output_file1)
            data_to_spreadsheet = data_from_xlsx[0]
            last_cell = data_from_xlsx[1]
            # insert updated data into spreadsheet
            insert_data_into_spreadsheet("file1_data", data_to_spreadsheet, last_cell)
    # repeat the same process for 2nd file
    if filename.startswith('file2'):
        input_file = source_dir + "\\file2.xls"
        output_file2 = "output_file2.xlsx"
        delete_file(output_dir, "output_file2")
        if save_as_xlsx(input_file, output_dir, output_file2) == True:
            delete_file(source_dir, 'file2')
            data_from_xlsx = get_the_data_from_xlsx(output_dir, output_file2)
            data_to_spreadsheet = data_from_xlsx[0]
            last_cell = data_from_xlsx[1]
            insert_data_into_spreadsheet("file2_data", data_to_spreadsheet, last_cell)

# open spreadsheet by name and open Dummy worksheet
# update one cell to sync the sheet with other sheets
sh = gc.open("second sheet")
worksheet = sh.worksheet("Dummy")
worksheet.update_acell('B1', '=Today()')

end = time.time()
print(end - start)
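On question 1: since the analysis suggests the "weird .xls" files are really html/htm, a hedged possibility is to let pandas parse them directly and skip the slow win32 conversion entirely. A sketch, untested against these particular files; it assumes an HTML parser such as lxml or html5lib is installed:

import pandas as pd

# Sketch: read the pseudo-.xls file as the HTML it appears to be.
# read_html returns a list of DataFrames, one per <table> in the file.
tables = pd.read_html(source_dir + "\\file1.xls")
df = tables[0].iloc[1:]              # drop the first row, as the win32 step did
data = df.values.flatten().tolist()  # flatten row by row, like get_the_data_from_xlsx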