I recently updated the Selenium WebDriver for one of our old apps because it had stopped working.
The driver itself now seems functional, but the script no longer does everything it is supposed to. Please help; perhaps there is an error in the logic inside the main function, but as mentioned, this used to work before.
Essentially, I have a switcher.py app that cycles through three browser tabs with a specified delay. I list the URLs in urls.txt: two entries are ordinary links, and the third is file://srv13nas1/departmental/3_Operations/350_Assembly/department/projects/PLL01_Ramp-up/08_Visualization/Presentations/, a directory that contains all of the users' PNGs. switcher.py is supposed to pick a presentation PNG based on the Serial Number value for the Work Center, which we get from one of two CSV files. If no serial number is found, it should fall back to a default image with our company's logo. See the main function for where that is supposed to happen.
For some reason, this last step of falling back to the default image stopped working, and I am not sure what to fix. The behavior is strange: the browser just shows "File not found". I know that a file with the expected name is not in the specified directory, but in that case the script should fall back to the default image.
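To be explicit about the behaviour I expect, here is a simplified sketch of the intended fallback (the helper name is made up; the real script builds the path from the CSV lookup shown below):

import os

def resolve_slide(presentation_dir, serial_number, default_image):
    # Hypothetical helper, only to illustrate the expected behaviour:
    # use the serial-number PNG if it exists, otherwise the logo slide.
    candidate = os.path.join(presentation_dir, serial_number + ".png")
    if os.path.exists(candidate):
        return "file://" + candidate
    return default_image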
Here is the source code for switcher.py:
from os.path import join, dirname
from datetime import datetime, timedelta
from selenium import webdriver
from time import sleep
import pandas as pd
import sys

ROOT = dirname(__file__)
PATH_P = join(r"\\srvditz1\DataXchange_R3\TUS_EXCHANGE", "ZPP00141_TUS350.csv")  # join("data", "zpp00141_all.csv")
PATH_E = join(r"\\srvditz1\DataXchange_R3\TUS_EXCHANGE", "ZPP00138_TUS350.csv")  # join("data", "sample_data1.csv")
DEFAULT_IMAGE = r'file://srv13nas1/departmental/3_Operations/350_Assembly/department/projects/PLL01_Ramp-up/08_Visualization/Presentations/Push_Slide_1.png'
DRIVER_PATH = join(
    ROOT,
    "resources",
    "msedgedriver.exe",
)
URLS_FILE = sys.argv[1] if len(sys.argv) == 2 else "urls.txt"
URLS_PATH = join(
    ROOT,
    URLS_FILE,
)
URLS = [x.strip() for x in open(URLS_PATH).readlines() if x.strip() != ""]
IS_LOOP = True
# Specify the delay value in seconds
DELAY = 20

def fix_urls():
    # The work center is encoded at the end of the first url
    wc = URLS[0].split('=')[-1]
    # Read dfe (executed)
    dfe = pd.read_csv(PATH_E, sep=";")
    # Filter on final confirmation
    filt = (dfe['Final Confirmation'] == 'X')
    dfe = dfe.loc[filt]
    # Read dfp (planned)
    dfp = pd.read_csv(PATH_P, sep=";")
    # Try to find the wc in the planned dataset, if not, then try the current dataset
    try:
        x = dfp[dfp["Work Center"] == wc]["Serial Number"].values[0]
    except:
        x = dfe[dfe["Work Ctr"] == wc]["Goods recipient"].values[0]
    URLS[-1] = URLS[-1] + x + ".png"

def g_driver():
    # Create a selenium webdriver
    driver = webdriver.Edge(DRIVER_PATH)
    # For each additional url, create an empty tab
    for i in range(len(URLS) - 1):
        driver.execute_script("window.open('');")
    return driver

def wait(n):
    # Waits until the n-th second of a minute
    # Get the previous refresh time
    last_time = datetime.now()
    last_time = last_time.replace(second=last_time.second - last_time.second % n)
    future_time = last_time + timedelta(seconds=n)
    diff = future_time - datetime.now()
    sleep(diff.total_seconds())

def switch(driver):
    # Keep running
    while IS_LOOP:
        # For each url
        for i in range(len(URLS)):
            # Switch to the tab
            driver.switch_to.window(driver.window_handles[i])
            # Refresh the url
            driver.get(URLS[i])
            # Stay on the page for a while
            wait(DELAY)
            driver.delete_all_cookies()

def main():
    # Get the png path
    try:
        fix_urls()
    except:
        URLS[-1] = DEFAULT_IMAGE
    # Create the browser instance
    driver = g_driver()
    # Start switching tabs
    switch(driver)

if __name__ == "__main__":
    main()
First of all, I should say that I have very little experience in programming, and I am having trouble with the logic and flow of a general web scraper implemented in Python. I assume I should use callbacks or similar methods to properly control the process of saving pages from a JavaScript e-book reader. My script does work, but not consistently. If someone could advise me on improvements to this script, that would be great. Thank you.
from seleniumwire import webdriver  # selenium-wire driver, needed for driver.requests
from seleniumwire.utils import decode as sdecode
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options  # [!]
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import os.path

opts = Options()  # [!]
opts.add_experimental_option('w3c', True)  # [!]
capabilities = DesiredCapabilities.CHROME.copy()
driver = webdriver.Chrome(chrome_options=opts, desired_capabilities=capabilities)

url = 'here comes url'
driver.get(url)

# These are defined elsewhere in my real script; placeholders here:
directory_path = r'here comes the folder for saved pages'
file_name = 'page_'
endmark = '&scale=2&rotate=0'  # length must be 17

def get_requests():
    # collect the unique image/page requests captured so far
    l = []
    for rx in driver.requests:
        if rx.url[-17:] == endmark:
            l.append(rx.url)
    return list(set(l))

def savepages(diff):
    newpages = 0
    for urlitem in diff:
        for request in driver.requests:
            if request.url == urlitem:
                # print(request.url)
                ind = urlitem.find('.jp2&id')  # ex. 0012.jp2&id
                file_path = directory_path + '\\' + file_name + urlitem[ind-4:ind] + '.jpg'
                tik = 0
                while tik < 10:  # waiting for the response body data
                    try:
                        tik += 1
                        data = sdecode(request.response.body, request.response.headers.get('Content-Encoding', 'identity'))
                    except AttributeError:  # no data yet
                        time.sleep(2)  # wait 2 sec for the data
                        continue
                # data = data.decode("utf-8", 'ignore')
                # sometimes I get this error: 'UnboundLocalError: local variable 'data' referenced before assignment'
                # I assumed that the following condition would help, but it doesn't seem to work consistently
                if data:
                    with open(file_path, 'wb') as outfile:
                        outfile.write(data)  # sometimes I get UnboundLocalError here
                else:
                    print('no data')
                # was the file saved or not
                if os.path.exists(file_path):
                    newpages += 1  # smth is wrong with the counting logic, since pages+newpages should equal the length of li=get_requests(), but I get more
                else:
                    time.sleep(.5)
    return newpages

count = 0   # a counter, should terminate the main delay loop
pages = 0   # counting all saved pages; book pages and images are equivalent, one turn should open 2 new pages/images/requests
oldli = []  # compare to the new list after each delay cycle
turns = 0   # count how many turns have been made, i.e. how many times we clicked the Next Page button

li = get_requests()  # get all unique requests of the images/pages; some requests might still be loading, but we manually opened the first page and visually confirmed that there are at least 1 or 3 images/requests

if li:  # the program STARTS HERE; on the first try there are some requests because we manually opened the first page
    # THE MAIN CYCLE should stop when the delay is too long and we have turned all the pages of the book
    while 2*turns + 1 < len(li) or count < 15:  # should terminate the whole program when no more images are coming
        count = 0        # reset counter
        success = False  # reset success; new pages downloaded successfully
        # the main delay counter
        # what happens if diff is [] and there is no success?
        while True:
            count += 1
            if count > 14:
                print('Time out after more than 10 seconds.')
                break
            li = get_requests()  # in addition, I assume that all requests counting from page 1 will be kept
            # it is possible that li will not have some of the old requests and oldli will be longer
            # well, I need to keep all old requests in a separate list and then append to it
            diff = list(set(li) - set(oldli))  # find new requests after the delay
            if diff:  # there are some new ones
                npages = savepages(diff)  # saves the new images and returns how many
                print('newpages ', npages, ' len diff ', len(diff))  # should be equal
                if npages >= len(diff) - 1:  # we allow one request without a body with data ??
                    pages += npages  # smth is not ok here, the number of pages sometimes exceeds the length of li
                    success = True   # we call it a success
                else:
                    print('Could not save pages. Newpages ', npages, ' len diff ', len(diff))
                    for pg in diff:
                        print(pg)  # for debugging purposes
                    break  # in this case we break from the delay cycle
            else:
                time.sleep(2)  # if there are no new requests, add 2 sec to the waiting time
            if success:  # we turn pages in case of a successful download; this is bad if we need to catch up
                while 2*turns + 1 < len(li):  # if some old requests are dropped then the program will stop earlier
                    # it won't wait for the bodies of the requests, there is a problem
                    driver.find_elements(By.CLASS_NAME, "BRicon.book_right.book_flip_next")[0].click()
                    turns += 1
                    time.sleep(3)  # I got the impression that this doesn't happen
                oldli = li
                print('pages ', pages, ' length of list ', len(li))
                break  # we break from the delay cycle since success
            time.sleep(2)  # the main delay timer; plus the no-diff timer = total time
else:
    print('no requests in the list to process')
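What I think I actually want for the waiting part is something like selenium-wire's wait_for_request instead of my manual tik loop, though I have not managed to wire it in yet; this is only a guess at how it would look:

def wait_for_page_image(drv, timeout=10):
    # wait_for_request blocks until a captured request whose URL matches the
    # given substring/regex appears, or raises a TimeoutException.
    try:
        req = drv.wait_for_request('scale=2&rotate=0', timeout=timeout)
        return req.response.body if req.response else None
    except Exception:  # TimeoutException if nothing matched in time
        return None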
I am a student working on a scraping project, and I am having trouble completing my script because it fills my computer's memory with all of the data it stores.
It currently keeps everything in one big list until the very end, so my idea is to break the scrape into smaller chunks and write the data out periodically, instead of building one big list and writing it all out at the end.
To do this, I would need to stop my scroll method, scrape the profiles that have loaded, write out the data I have collected, and then repeat this process without duplicating my data (see the sketch after my code below for the rough idea). It would be appreciated if someone could show me how to do this. Thank you for your help :)
Here's my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException

Data = []

driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

while len(profile_count) < count:  # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

for link in profile_count:  # Calling up links
    temp = link.get_attribute('href')  # temp for
    driver.execute_script("window.open('');")  # open new tab
    driver.switch_to.window(driver.window_handles[1])  # focus new tab
    driver.get(temp)
    # scrape code
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
    IssuedBy = "Board of Certified Safety Professionals"
    CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
    CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
    try:
        AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
    except NoSuchElementException:
        AccreditedBy = "N/A"
    try:
        Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
    except NoSuchElementException:
        Expires = "N/A"
    info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires + "\n"
    Data.extend(info)
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

with open("Spredsheet.txt", "w") as output:
    output.write(','.join(Data))

driver.close()
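The shape of what I am after is roughly this, I think; just a sketch of the periodic write-out idea with a made-up chunk size, not tested against the site:

import csv

CHUNK_SIZE = 50  # made-up value: flush after every 50 profiles

def flush_rows(rows, path="Spredsheet.txt"):
    # Append the buffered rows to the file and empty the buffer,
    # so memory only ever holds up to CHUNK_SIZE profiles at a time.
    with open(path, "a", newline="") as output:
        csv.writer(output).writerows(rows)
    rows.clear()

# Inside the profile loop I would then do something like:
#     buffer.append(info)
#     if len(buffer) >= CHUNK_SIZE:
#         flush_rows(buffer)
# and call flush_rows(buffer) once more after the loop for the remainder.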
Try the below approach using requests and BeautifulSoup. In the script below I have used the search API URL taken from the website itself.
On the first iteration it builds the URL, writes the header row, and adds the data to the .csv file.
On every following iteration it builds the URL again with two extra parameters, start_on_page and show_per_page, where start_on_page starts at 20 and is incremented by 20 on each iteration, and show_per_page is set to 100 so that 100 records are extracted per iteration, until all of the data has been dumped into the .csv file.
The script dumps four things: number, name, location and profile URL.
On each iteration the data is appended to the .csv file, so your memory issue is resolved by this approach.
Do not forget to set the file_path variable to the directory where you want the .csv file created before running the script.
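For example, the URL produced for the second iteration looks like this (parameter values taken from the script's defaults):

https://directory.bcsp.org/search_results.php?start_on_page=20&show_per_page=100&first_name=&last_name=&city=&state=&country=&certification=&unauthorized=0&retired=0&specialties=&industries=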
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
import csv

def scrap_directory_data():
    list_of_credentials = []
    file_path = ''
    file_name = 'credential_list.csv'
    count = 0
    page_number = 0
    page_size = 100
    create_url = ''
    main_url = 'https://directory.bcsp.org/search_results.php?'
    first_iteration_url = 'first_name=&last_name=&city=&state=&country=&certification=&unauthorized=0&retired=0&specialties=&industries='
    number_of_records = 0
    csv_headers = ['#', 'Name', 'Location', 'Profile URL']

    while True:
        if count == 0:
            create_url = main_url + first_iteration_url
            print('-' * 100)
            print('1 iteration URL created: ' + create_url)
            print('-' * 100)
        else:
            create_url = main_url + 'start_on_page=' + str(page_number) + '&show_per_page=' + str(page_size) + '&' + first_iteration_url
            print('-' * 100)
            print('Other then first iteration URL created: ' + create_url)
            print('-' * 100)
        page = requests.get(create_url, verify=False)
        extracted_text = bs(page.text, 'lxml')
        result = extracted_text.find_all('tr')
        if len(result) > 0:
            for idx, data in enumerate(result):
                if idx > 0:
                    number_of_records += 1
                    name = data.contents[1].text
                    location = data.contents[3].text
                    profile_url = data.contents[5].contents[0].attrs['href']
                    list_of_credentials.append({
                        '#': number_of_records,
                        'Name': name,
                        'Location': location,
                        'Profile URL': profile_url
                    })
                print(data)
                with open(file_path + file_name, 'a+') as cred_CSV:
                    csvwriter = csv.DictWriter(cred_CSV, delimiter=',', lineterminator='\n', fieldnames=csv_headers)
                    if idx == 0 and count == 0:
                        print('Writing CSV header now...')
                        csvwriter.writeheader()
                    else:
                        for item in list_of_credentials:
                            print('Writing data rows now..')
                            print(item)
                            csvwriter.writerow(item)
                        list_of_credentials = []
        count += 1
        page_number += 20

scrap_directory_data()
I am trying to create a Python/Selenium project which checks whether the people in my WhatsApp chat list are online or offline. Basically it brute-forces through the list one by one to check whether someone is online, saves the data in an Excel file, and gives a green background to the people who are online.
Here is my code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from openpyxl import Workbook
from openpyxl.styles import PatternFill
import time

workbook = Workbook()
sheet = workbook.active

browser = webdriver.Chrome(executable_path=r"F:\software\chromedriver_win32\chromedriver.exe")
browser.get('https://web.whatsapp.com/')

print("Loading..\n")
for x in range(5, 0, -1):
    print(x)
    time.sleep(1)

# the below function checks whether the 'online' element exists or not
# I got the class name by inspecting the WhatsApp Web page
def check_exists_by_xpath():
    try:
        browser.find_element_by_xpath('//span[@class="O90ur _3FXB1"]')
    except NoSuchElementException:
        return False
    return True

count = 1
# the xpath gets the names of the people on my chat list
for iterator in browser.find_elements_by_xpath('//div[@class="_2wP_Y"]'):
    iterator.click()
    cellA = "A" + str(count)
    cellB = "B" + str(count)
    time.sleep(2)
    name = browser.find_element_by_xpath('//div[@class="_3XrHh"]/span').text
    if check_exists_by_xpath() == True:
        sheet[cellA] = name
        sheet[cellB] = " isOnline\n"
        sheet[cellA].fill = PatternFill(start_color="a4d968", end_color="a4d968", fill_type="solid")
        sheet[cellB].fill = PatternFill(start_color="a4d968", end_color="a4d968", fill_type="solid")
    if check_exists_by_xpath() == False:
        sheet[cellA] = name
        sheet[cellB] = " isOffline\n"
    count = count + 1

workbook.save(filename="WhatsApp_Data.xlsx")
print("Complete..!")
browser.close()
But I can't understand why the code stops after collecting data for 18 people. Also, can anyone suggest a better technique to achieve this, other than brute-forcing?
The code just clicks on the names of the people in the WhatsApp Web list, and if the element which displays the 'online' message (beneath the name) exists, it returns true, otherwise false.
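For reference, the same presence check can also be written without the exception handling; a small sketch using the same class name as above (which will of course break whenever WhatsApp changes its markup):

def check_exists_by_xpath():
    # find_elements returns an empty list instead of raising,
    # so the presence check becomes a simple length test.
    return len(browser.find_elements_by_xpath('//span[@class="O90ur _3FXB1"]')) > 0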
I'm trying to get the time it takes a page to fully load (like the Finish value in Chrome DevTools). I wrote something in Python but I get really low time results, way less than a second, which is not realistic. This is what I have:
from urllib.request import urlopen
from time import time

class Webpage:
    def __init__(self, pageName, pageUrl):
        self.pageName = pageName
        self.pageUrl = pageUrl

class LoadingDetail:
    def __init__(self, webPage, loadingTimes):  # Getting the webpage object and its loading times
        self.webPage = webPage
        self.loadingTimes = loadingTimes

pages = [
    Webpage("test", "URL"),
    Webpage("test2", "URL"),
    Webpage("test3", "URL"),
    Webpage("test4", "URL"),
]

loadingDeatils = []
for page in pages:  # Going through each page in the array.
    pageLoadTimes = []  # Storing the time it took the page to load.
    for x in range(0, 3):  # Number of times we request each page.
        stream = urlopen(page.pageUrl)
        startTime = time()
        streamRead = stream.read()
        endTime = time()
        stream.close()
        timeToLoad = endTime - startTime  # The time it took to read the whole page.
        pageLoadTimes.append(timeToLoad)
    loadDetails = LoadingDetail(page, pageLoadTimes)
    loadingDeatils.append(loadDetails)
I get results like 0.00011. I searched but only found Selenium-based answers, which I can't use.
Is there a way to do this with Python only? Is Python the right tool for this? I saw answers in JS that seem to be exactly what I was looking for.
I tried it and was able to measure the time with the code below; this might help.
Reference: geeksforgeeks.org/timeit-python-examples/
import timeit
mysetup = "from urllib.request import urlopen"
mycode = '''
urlopen('http://www.python.org')
'''
print(timeit.timeit(setup = mysetup, stmt = mycode, number = 1))
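Note that this only times the download of the HTML document itself; linked images, CSS and JavaScript are never fetched by urlopen, so it will not match the DevTools "Finish" value. If you want the per-page timing from your existing loop instead of timeit, a small sketch that at least includes connection setup would be:

from time import perf_counter
from urllib.request import urlopen

def fetch_time(url):
    # Start the clock before urlopen so connection/TLS setup is included;
    # this still measures only the HTML document, not its resources.
    start = perf_counter()
    with urlopen(url) as stream:
        stream.read()
    return perf_counter() - start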
I am working on a script to scrape a website. The problem is that it works normally when I run it with the interpreter, but after compiling it (PyInstaller or py2exe) it fails; it appears that mechanize/requests both fail to keep the session alive.
I have hidden my username and password here, but they are set correctly in the compiled code.
import requests
from bs4 import BeautifulSoup as bs
from sys import argv
import re
import logging

url = argv[1]
payload = {"userName": "real_username", "password": "realpassword"}
session = requests.session()
resp = session.post("http://website.net/login.do", data=payload)
if "forgot" in resp.content:
    logging.error("Login failed")
    exit()
resp = session.get(url)
soup = bs(resp.content)
urlM = url[:url.find("?") + 1] + "page=(PLACEHOLDER)&" + \
    url[url.find("?") + 1:]
# Get number of pages
regex = re.compile("\|.*\|\sof\s(\d+)")
script = str(soup.findAll("script")[1])
epNum = int(re.findall(regex, script)[0])  # Number of EPs
pagesNum = epNum // 50
links = []
# Get list of links
# If number of EPs > 50, more than one page
if pagesNum == 0:
    links = [url]
else:
    for i in range(1, pagesNum + 2):
        url = urlM.replace("(PLACEHOLDER)", str(i))
        links.append(url)
# Loop over the links and extract info: ID, NAME, START_DATE, END_DATE
raw_info = []
for pos, link in enumerate(links):
    print "Processing page %d" % (pos + 1)
    sp = bs(session.get(link).content)
    table = sp.table.table
    raw_info.extend(table.findAll("td"))
epURL = "http://www.website.net/exchange/viewep.do?operation"\
        "=executeAction&epId="
# Final data extraction
raw_info = map(str, raw_info)
ids = [re.findall("\d+", i)[0] for i in raw_info[::4]]
names = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[1::4]]
start_dates = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[2::4]]
end_dates = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[3::4]]
emails = []
eplinks = [epURL + str(i) for i in ids]
print names
The error happens at the epNum variable, which I take to mean that the HTML page returned is not the one I requested. It works on Linux both as a script and compiled, and on Windows as a script, but it fails on Windows when compiled.
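To narrow it down, I was thinking of dumping whatever the compiled build actually receives before the regex runs, something like this (just a debugging sketch placed right after resp = session.get(url)):

# Debugging sketch: save what the compiled build actually received,
# so the login page vs. the expected page can be compared by eye.
with open("debug_response.html", "wb") as dump:
    dump.write(resp.content)
logging.error("status=%s final_url=%s", resp.status_code, resp.url)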
The py2exe tutorial mentions that you need MSVCR90.dll; did you check that it is present on the PC?