My following code:
#!/usr/bin/env python
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
# Define firefox profile
download_dir = "/Users/pdubois/Desktop/TargetMine_Output/"
fp = webdriver.FirefoxProfile()
fp.set_preference("browser.download.folderList",2)
fp.set_preference("browser.download.manager.showWhenStarting",False)
fp.set_preference("browser.download.dir", download_dir)
#fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text")
fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
driver = webdriver.Firefox(fp)
driver.implicitly_wait(20)
genes = "Eif2ak2,Pcgf5,Vps25,Prmt6,Tcerg1,Dnttip2,Preb,Polr1b,Gabpb1,Prdm1,Fosl2,Zfp143,Psip1,Kat6a,Tgif1,Txn1,Irf8,Cnot6l,Zfp451,Foxk2,Lpxn,Etv6,Khsrp,Lmo4,Nkrf,Mafk,Mbd1,Cited2,Elp5,Jdp2,Bzw1,Rbm15b,Klf9,Gtf2e2,Dynll1,Klf6,Stat1,Srrt,Gtf2f1,Adnp2,Ikbkg,Mybbp1a,Nup62,Brd2,Chd1,Kctd1,Sap30,Cebpd,Mtf1,Gtf2h2,Fubp1,Tcea1,Irf2bp2,Ezh2,Hnrpdl,Pml,Cebpz,Med7"
targetmine_url = "http://targetmine.nibio.go.jp/targetmine/begin.do"
driver.get(targetmine_url)
# Define type of list to be submitted
gene_select = Select(driver.find_element_by_name("type"))
gene_select.select_by_visible_text(u"Gene")
# Enter list and submit
gene_input = driver.find_element_by_id("listInput")
gene_input.send_keys(genes)
submit = driver.find_element_by_css_selector("input.button.light").click()
# Choose name for list
driver.find_element_by_id("newBagName").clear()
driver.find_element_by_id("newBagName").send_keys("ADX.06.ID.Clust1")
driver.switch_to_frame("__pomme::0")
# Add All
driver.find_element_by_css_selector("span.small.success.add-all.button").click()
# Save all genes
driver.find_element_by_css_selector("a.success.button.save").click()
# Select M. Musculus
driver.find_element_by_xpath("//ul[#id='customConverter']/li[2]/a[1]").click()
# Gene enrchment part
go_xpath = "//div[#id='gene_go_enrichment-widget']/div[#class='inner']/div[1]/div[#class='form']/form[#style='margin:0']/div[2]/select[1]"
#driver.find_element_by_xpath(go_xpath).click()
go_select = Select(driver.find_element_by_xpath(go_xpath))
go_select.select_by_visible_text(u"1.00")
# Download
#driver.find_element_by_css_selector("a.btn.btn-small.export").click()
Works fine. Which end with this instances:
One last thing I want to achieve then is to Save the file automatically.
Despite I've already set the Firefox profile at the top
of the code, it doesn't do as I hoped. What's the right way to do it?
Update:
The solution by alecxe works.
Except I tried this, it doesn't save the file.
go_download_xpath = "//div[#id='gene_go_enrichment-widget']/div[#class='inner']/div[1]/div[2]/a[#class='btn btn-small export']"
driver.find_element_by_xpath(go_download_xpath).click()
# it saved the specific desired file.
# using
#driver.find_element_by_css_selector("a.btn.btn-small.export").click()
#save the wrong file.
This particular dialog cannot be controlled via selenium - this is a browser popup, not a javascript popup (which can be automated with swith_to.alert).
In this case, you need to avoid the popup being shown in the first place and make Firefox download the file automatically by tweaking browser's desired capabilities (aka profile preferences). Firefox can download files automatically depending on the mime-type of the file being downloaded. In your case it is text/plain:
fp = webdriver.FirefoxProfile()
fp.set_preference("browser.download.folderList",2)
fp.set_preference("browser.download.dir", download_dir)
fp.set_preference("browser.download.manager.showWhenStarting", False)
fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
driver = webdriver.Firefox(firefox_profile=fp)
FYI, I've downloaded the file manually and used magic module to detect the mime-type:
In [1]: import magic
In [2]: mime = magic.Magic(mime=True)
In [3]: mime.from_file("result.tsv")
Out[3]: 'text/plain'
Try:
fp = webdriver.FirefoxProfile()
fp.set_preference("browser.download.dir", download_dir)
fp.set_preference("browser.preferences.instantApply", True)
fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
"text/plain, application/octet-stream, application/binary, text/csv, application/csv, application/excel, text/comma-separated-values, text/xml, application/xml")
fp.set_preference("browser.helperApps.alwaysAsk.force", False)
fp.set_preference("browser.download.manager.showWhenStarting", False)
fp.set_preference("browser.download.folderList", 2)
driver = webdriver.Firefox(firefox_profile=fp)
Related
I am new to web automation using selenium Firefox python environment.
Step 1:- I am hitting a webpage using driver.get. there by I am setting up my preference to download the file in the specific location.
Step2:- Providing data to Decal Number after finding driver.find_element_by_id("DecalNumber"). And click Begin search button.
Step3:- Now again click button by accessing through xpath.
Step4:- Now again click one more button by accessing through xpath. this will open a file in new window (https://www.tdlr.texas.gov/DownLoadableContent/Elevator/Images/1197897/26695455.tif) as an example.
My questions are all follows:-
Since the file is automatically getting saved in the preferenced location, and most of the time I get 404 error no data found and some of the time I get .tiff image data. how do I check whether it as data or 404 error????
If 404 error is not found, then need to rename the file that is being downloaded with its associated Decal Number using Shutil library. but the problem here is, if I have 404 error then no new file is getting downloaded. But shutil.move() is tryin to rename the file which is already downloaded for other decal number. Shutil is trying to change the file name wit current decal number, which should not supposed to do. how to stop this happening?
Any lead are welcome.
from selenium import webdriver
# create webdriver object
fp = webdriver.FirefoxProfile()
fp.set_preference('browser.download.dir', r'C:\Users\Swaminathan Sekar\Documents\Decal_Final')
fp.set_preference('browser.download.folderList', 2)
fp.set_preference('browser.helperApps.neverAsk.saveToDisk', "image/tiff")
fp.set_preference('browser.download.manager.showWhenStarting', False)
fp.set_preference('browser.helperApps.neverAsk.openFile', "image/tiff")
fp.set_preference('browser.helperApps.alwaysAsk.force', False)
fp.set_preference('browser.download.manager.useWindow', False)
fp.set_preference('browser.download.manager.focusWhenStarting', False)
fp.set_preference('browser.download.manager.alertOnEXEOpen', False)
fp.set_preference('browser.download.manager.showAlertOnComplete', False)
fp.set_preference('browser.download.manager.closeWhenDone', False)
fp.set_preference("pdfjs.disabled", True)
driver = webdriver.Firefox(fp)
driver.maximize_window()
driver.implicitly_wait(20)
driver.get("https://www.tdlr.texas.gov/Elevator_SearchApp/Elevator/Search")
window_before = driver.current_window_handle
Decal_test=Decal_Number[75:76]
for i in range(0,len(Decal_test)):
inputElement1 = driver.find_element_by_id("DecalNumber")
inputElement1.clear()
inputElement1.send_keys(Decal_test[i])
driver.find_elements_by_xpath("/html/body/div[1]/div[2]/div[3]/div/div[3]/div/form/div[6]/div/div[2]/button")[0].click()
driver.find_elements_by_xpath("/html/body/div[1]/div[2]/div[3]/div/div[3]/div[2]/div/table/tbody/tr/td[3]/a[1]")[0].click()
driver.find_elements_by_xpath("/html/body/div[1]/div[2]/div[2]/div/div[2]/div[1]/div[1]/table/tbody/tr[6]/td/button")[0].click()
window_after = driver.window_handles[1]
driver.switch_to.window(window_after)
driver.implicitly_wait(30)
driver.refresh()
driver.find_element_by_xpath("//td[text()='Subsequent Inspection']/preceding-sibling::td[1]/a").click()
wait = WebDriverWait(driver, 10)
wait.until(EC.text_to_be_present_in_element(".tiff"))
print(driver.current_url)
time.sleep(20)
driver.switch_to.window(window_before)
driver.back()
driver.back()
Initial_path= r'C:\Users\Swaminathan Sekar\Documents\Decal_Final'
filename = max([Initial_path + "\\" + f for f in os.listdir(Initial_path)],key=os.path.getctime,default=0)
if filename !=0:
shutil.move(filename,os.path.join(Initial_path, str(i)+'_'+str(Decal_test[i]) + ".tiff"))
print('Elasped time =',time.process_time(),'s')
Example: https://apps1.lavote.net/camp/comm.cfm?&cid=82
With Selenium, I am clicking the first Form 497. In my browser, a new tab of the pdf opens. In selenium, nothing seems to happen.
Here is my code, with some parts redacted.
def scrape(session_key=None):
options = Options()
options.headless = True
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.download.dir", os.path.join(base_dir, 'reports'))
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.helperApps.alwaysAsk.force", False);
profile.set_preference("browser.download.manager.showAlertOnComplete", False)
profile.set_preference("browser.download.manager.showWhenStarting", False);
profile.set_preference('browser.helperApps.neverAsk.saveToDisk','application/zip,application/octet-stream,application/x-zip-compressed,multipart/x-zip,application/x-rar-compressed, application/octet-stream,application/msword,application/vnd.ms-word.document.macroEnabled.12,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/rtf,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.ms-excel,application/vnd.ms-word.document.macroEnabled.12,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/xls,application/msword,text/csv,application/vnd.ms-excel.sheet.binary.macroEnabled.12,text/plain,text/csv/xls/xlsb,application/csv,application/download,application/vnd.openxmlformats-officedocument.presentationml.presentation,application/octet-stream')
profile.set_preference("pdfjs.disabled", True)
profile.set_preference("plugin.disable_full_page_plugin_for_types", "application/pdf")
driver = webdriver.Firefox(firefox_profile=profile, options=options)
driver.get(magic_url)
committee_table = driver.find_elements_by_css_selector('table')[2]
links = [link.get_attribute('href') for link in committee_table.find_elements_by_tag_name('a')]
driver.get('https://apps1.lavote.net/camp/comm.cfm?&cid=82')
forms_table = driver.find_elements_by_css_selector('table')[1]
forms_table_trs = forms_table.find_elements_by_css_selector('tr')
for i, row in enumerate(forms_table_trs):
if i > 0:
cells = row.find_elements_by_css_selector('td')
print(1)
try:
link = cells[2].find_elements_by_tag_name('a')[0]
link.click()
pdfs = glob.glob(os.path.join(base_dir, 'scraper/*.pdf'))
latest_pdf_file = max(pdfs, key=os.path.getctime)
parse_funcs[form_type](latest_pdf_file)
except Exception as e:
print(e)
As you may have guessed, there are no pdfs. They are not downloaded. That's why I'm here. How can I do this?
If you only need the files and not to test the actual browser dialogue routine, grab the files using Python instead of asking Selenium to do that.
Grab the PDF URLs from the page, then use request to download the file to your memory and then open().write() to save it to the file system.
req = requests.get(url, allow_redirects=True)
open(filename, 'wb').write(r.content)
You can also get the filename from r, but it's a bit bothersome. Check it here: https://www.codementor.io/#aviaryan/downloading-files-from-urls-in-python-77q3bs0un
I need to download a set of individual pdf files from a webpage. It is publicly available by government (ministry of education in Turkey) so totally legal.
However my selenium browser only displays the pdf file, how can I download it and name as I wish.
(This code is also from web)
# Import your newly installed selenium package
from selenium import webdriver
from bs4 import BeautifulSoup
# Now create an 'instance' of your driver
# This path should be to wherever you downloaded the driver
driver = webdriver.Chrome(executable_path="/Users/ugur/Downloads/chromedriver")
# A new Chrome (or other browser) window should open up
download_dir = "/Users/ugur/Downloads/" # for linux/*nix, download_dir="/usr/Public"
options = webdriver.ChromeOptions()
profile = {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], # Disable Chrome's PDF Viewer
"download.default_directory": download_dir , "download.extensions_to_open": "applications/pdf"}
options.add_experimental_option("prefs", profile)
# Now just tell it wherever you want it to go
driver.get("https://odsgm.meb.gov.tr/kurslar/KazanimTestleri.aspx?sinifid=5&ders=29")
driver.find_element_by_id("ContentPlaceHolder1_dtYillikPlanlar_lnkIndir_2").click()
driver.get("https://odsgm.meb.gov.tr/kurslar/PDFFile.aspx?name=kazanimtestleri.pdf")
Thanks in advance
Extra information:
I had a python 2 code doing this perfectly. But somehow it creates empty files and I couldn't convert it to python 3. Maybe this helps (no offense but I never liked selenium)
import urllib
import urllib2
from bs4 import BeautifulSoup
import os
sinifId=5
maxOrd = 1
fileNames=[]
directory = '/Users/ugur/Downloads/Hasan'
print 'List of current files in directory '+ directory+'\n---------------------------------\n\n'
for current_file in os.listdir(directory):
if (current_file.find('pdf')>-1 and current_file.find(' ')>-1):
print current_file
order = int(current_file.split(' ',1)[0])
if order>maxOrd: maxOrd=order
fileNames.append(current_file.split(' ',2)[1])
print '\n\nStarting download \n---------------------------------\n'
ctA=int(maxOrd+1)
for ders in [29]:
urlSinif='http://odsgm.meb.gov.tr/kurslar/KazanimTestleri.aspx?sinifid='+str(sinifId)+'&ders='+str(ders)
page = urllib2.urlopen(urlSinif)
soup = BeautifulSoup(page,"lxml")
st = soup.prettify()
count=st.count('ctl00')-1
dersAdi = soup.find('a', href='/kurslar/CevapAnahtarlari.aspx?sinifid='+str(sinifId)+'&ders='+str(ders)).getText().strip()
for testNo in range(count):
if(str(sinifId)+str(ders)+str(testNo+1) in fileNames):
print str(ctA)+' '+str(sinifId)+str(ders)+str(testNo+1)+' '+dersAdi+str(testNo+1)+'.pdf'+' skipped'
else:
annex=""
if(testNo%2==1): annex="2"
eiha_url = u'http://odsgm.meb.gov.tr/kurslar/KazanimTestleri.aspx?sinifid='+str(sinifId)+'&ders='+str(ders)
data = ('__EVENTTARGET','ctl00$ContentPlaceHolder1$dtYillikPlanlar$ctl'+format(testNo, '02')+'$lnkIndir'+annex), ('__EVENTARGUMENT', '39')
print 'ctl00$ContentPlaceHolder1$dtYillikPlanlar$ctl'+format(testNo, '02')+'$lnkIndir'+annex
new_data = urllib.urlencode(data)
response = urllib2.urlopen(eiha_url, new_data)
urllib.urlretrieve (str(response.url), directory+'/{0:0>3}'.format(ctA)+' '+str(sinifId)+str(ders)+str(testNo+1)+' '+dersAdi+str(testNo+1)+'.pdf')
print str(ctA)+' '+str(sinifId)+str(ders)+str(testNo+1)+' '+dersAdi+str(testNo+1)+'.pdf'+' downloaded'
ctA=ctA+1
Add your options before launching Chrome and then specify the chrome_options parameter.
download_dir = "/Users/ugur/Downloads/"
options = webdriver.ChromeOptions()
profile = {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
"download.default_directory": download_dir,
"download.extensions_to_open": "applications/pdf"}
options.add_experimental_option("prefs", profile)
driver = webdriver.Chrome(
executable_path="/Users/ugur/Downloads/chromedriver",
chrome_options=options
)
To answer your second question:
May I ask how to specify the filename as well?
I found this: Selenium give file name when downloading
What I do is:
file_name = ''
while file_name.lower().endswith('.pdf') is False:
time.sleep(.25)
try:
file_name = max([download_dir + '/' + f for f in os.listdir(download_dir)], key=os.path.getctime)
except ValueError:
pass
Here is the code sample I used to download pdf with a specific file name. First you need to configure chrome webdriver with required options. Then after clicking the button (to open pdf popup window), call a function to wait for download to finish and rename the downloaded file.
import os
import time
import shutil
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# function to wait for download to finish and then rename the latest downloaded file
def wait_for_download_and_rename(newFilename):
# function to wait for all chrome downloads to finish
def chrome_downloads(drv):
if not "chrome://downloads" in drv.current_url: # if 'chrome downloads' is not current tab
drv.execute_script("window.open('');") # open a new tab
drv.switch_to.window(driver.window_handles[1]) # switch to the new tab
drv.get("chrome://downloads/") # navigate to chrome downloads
return drv.execute_script("""
return document.querySelector('downloads-manager')
.shadowRoot.querySelector('#downloadsList')
.items.filter(e => e.state === 'COMPLETE')
.map(e => e.filePath || e.file_path || e.fileUrl || e.file_url);
""")
# wait for all the downloads to be completed
dld_file_paths = WebDriverWait(driver, 120, 1).until(chrome_downloads) # returns list of downloaded file paths
# Close the current tab (chrome downloads)
if "chrome://downloads" in driver.current_url:
driver.close()
# Switch back to original tab
driver.switch_to.window(driver.window_handles[0])
# get latest downloaded file name and path
dlFilename = dld_file_paths[0] # latest downloaded file from the list
# wait till downloaded file appears in download directory
time_to_wait = 20 # adjust timeout as per your needs
time_counter = 0
while not os.path.isfile(dlFilename):
time.sleep(1)
time_counter += 1
if time_counter > time_to_wait:
break
# rename the downloaded file
shutil.move(dlFilename, os.path.join(download_dir,newFilename))
return
# specify custom download directory
download_dir = r'c:\Downloads\pdf_reports'
# for configuring chrome pdf viewer for downloading pdf popup reports
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', {
"download.default_directory": download_dir, # Set own Download path
"download.prompt_for_download": False, # Do not ask for download at runtime
"download.directory_upgrade": True, # Also needed to suppress download prompt
"plugins.plugins_disabled": ["Chrome PDF Viewer"], # Disable this plugin
"plugins.always_open_pdf_externally": True, # Enable this plugin
})
# get webdriver with options for configuring chrome pdf viewer
driver = webdriver.Chrome(options = chrome_options)
# open desired webpage
driver.get('https://mywebsite.com/mywebpage')
# click the button to open pdf popup
driver.find_element_by_id('someid').click()
# call the function to wait for download to finish and rename the downloaded file
wait_for_download_and_rename('My file.pdf')
# close the browser windows
driver.quit()
Set timeout (120) to the wait time as per your needs.
Non-selenium solution, You can do something like:
import requests
pdf_resp = requests.get("https://odsgm.meb.gov.tr/kurslar/PDFFile.aspx?name=kazanimtestleri.pdf")
with open("save.pdf", "wb") as f:
f.write(pdf_resp.content)
Although you might want to check the content type before to make sure it's a pdf
My purpose it to download a zip file from https://www.shareinvestor.com/prices/price_download_zip_file.zip?type=history_all&market=bursa
It is a link in this webpage https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa. Then save it into this directory "/home/vinvin/shKLSE/ (I am using pythonaywhere). Then unzip it and the csv file extract in the directory.
The code run until the end with no error but it does not downloaded.
The zip file is automatically downloaded when click on https://www.shareinvestor.com/prices/price_download_zip_file.zip?type=history_all&market=bursa manually.
My code with a working username and password is used. The real username and password is used so that it is easier to understand the problem.
#!/usr/bin/python
print "hello from python 2"
import urllib2
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from pyvirtualdisplay import Display
import requests, zipfile, os
display = Display(visible=0, size=(800, 600))
display.start()
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
profile.set_preference('browser.download.dir', "/home/vinvin/shKLSE/")
profile.set_preference('browser.helperApps.neverAsk.saveToDisk', '/zip')
for retry in range(5):
try:
browser = webdriver.Firefox(profile)
print "firefox"
break
except:
time.sleep(3)
time.sleep(1)
browser.get("https://www.shareinvestor.com/my")
time.sleep(10)
login_main = browser.find_element_by_xpath("//*[#href='/user/login.html']").click()
print browser.current_url
username = browser.find_element_by_id("sic_login_header_username")
password = browser.find_element_by_id("sic_login_header_password")
print "find id done"
username.send_keys("bkcollection")
password.send_keys("123456")
print "log in done"
login_attempt = browser.find_element_by_xpath("//*[#type='submit']")
login_attempt.submit()
browser.get("https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa")
print browser.current_url
time.sleep(20)
dl = browser.find_element_by_xpath("//*[#href='/prices/price_download_zip_file.zip?type=history_all&market=bursa']").click()
time.sleep(30)
browser.close()
browser.quit()
display.stop()
zip_ref = zipfile.ZipFile(/home/vinvin/sh/KLSE, 'r')
zip_ref.extractall(/home/vinvin/sh/KLSE)
zip_ref.close()
os.remove(zip_ref)
HTML snippet:
<li>All Historical Data <span>About 220 MB</span></li>
Note that & is shown when I copy the snippet. It was hidden from view source, so I guess it is written in JavaScript.
Observation I found
The directory home/vinvin/shKLSE do not created even I run the code with no error
I try to download a much smaller zip file which can be completed in a second but still do not download after a wait of 30s. dl = browser.find_element_by_xpath("//*[#href='/prices/price_download_zip_file.zip?type=history_daily&date=20170519&market=bursa']").click()
I don't see any major drawback in your code block as such. But here are a few recommendations through this Solution & the execution of this Automated Test Script:
This code works perfect in Off Market Hours. During Market Hours a lot of JavaScript & Ajax Calls are in play and handling those are beyond the scope of this Question.
You may consider checking for the the intended download directory first & if not available, create a new one. That code block for this functionality is in Windows style and works perfect on Windows platform.
Once you click on "Login" induce some wait for the HTML DOM to render properly.
When you want to see off the downloading process, you need to set certain more preferences in the FirefoxProfile as mentioned in my code below.
Always consider maximizing the browser window through browser.maximize_window()
When you start downloading you need to wait for sufficient amount of time to get the file completely downloaded.
If you are using browser.quit() at the end you don't need to use browser.close()
You may consider to replace all the time.sleep() with either of ImplicitlyWait or ExplicitWait or FluentWait.
Here is your own code block with some simple tweaks in it:
#!/usr/bin/python
print "hello from python 2"
import urllib2
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from pyvirtualdisplay import Display
import requests, zipfile, os
display = Display(visible=0, size=(800, 600))
display.start()
newpath = 'C:\\home\\vivvin\\shKLSE'
if not os.path.exists(newpath):
os.makedirs(newpath)
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.download.dir",newpath);
profile.set_preference("browser.download.folderList",2);
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/zip");
profile.set_preference("browser.download.manager.showWhenStarting",False);
profile.set_preference("browser.helperApps.neverAsk.openFile","application/zip");
profile.set_preference("browser.helperApps.alwaysAsk.force", False);
profile.set_preference("browser.download.manager.useWindow", False);
profile.set_preference("browser.download.manager.focusWhenStarting", False);
profile.set_preference("browser.helperApps.neverAsk.openFile", "");
profile.set_preference("browser.download.manager.alertOnEXEOpen", False);
profile.set_preference("browser.download.manager.showAlertOnComplete", False);
profile.set_preference("browser.download.manager.closeWhenDone", True);
profile.set_preference("pdfjs.disabled", True);
for retry in range(5):
try:
browser = webdriver.Firefox(profile)
print "firefox"
break
except:
time.sleep(3)
time.sleep(1)
browser.maximize_window()
browser.get("https://www.shareinvestor.com/my")
time.sleep(10)
login_main = browser.find_element_by_xpath("//*[#href='/user/login.html']").click()
time.sleep(10)
print browser.current_url
username = browser.find_element_by_id("sic_login_header_username")
password = browser.find_element_by_id("sic_login_header_password")
print "find id done"
username.send_keys("bkcollection")
password.send_keys("123456")
print "log in done"
login_attempt = browser.find_element_by_xpath("//*[#type='submit']")
login_attempt.submit()
browser.get("https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa")
print browser.current_url
time.sleep(20)
dl = browser.find_element_by_xpath("//*[#href='/prices/price_download_zip_file.zip?type=history_all&market=bursa']").click()
time.sleep(900)
browser.close()
browser.quit()
display.stop()
zip_ref = zipfile.ZipFile(/home/vinvin/sh/KLSE, 'r')
zip_ref.extractall(/home/vinvin/sh/KLSE)
zip_ref.close()
os.remove(zip_ref)
Let me know if this Answers your Question.
I rewrote your script, with comments explaining why I made the changes I made. I think your main problem might have been a bad mimetype, however, your script had a log of systemic issues that would have made it unreliable at best. This rewrite uses explicit waits, which completely removes the need to use time.sleep(), allowing it to run as fast as possible, while also eliminating errors that arise from network congestion.
You will need do the following to make sure all modules are installed:
pip install requests explicit selenium retry pyvirtualdisplay
The script:
#!/usr/bin/python
from __future__ import print_function # Makes your code portable
import os
import glob
import zipfile
from contextlib import contextmanager
import requests
from retry import retry
from explicit import waiter, XPATH, ID
from selenium import webdriver
from pyvirtualdisplay import Display
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
DOWNLOAD_DIR = "/tmp/shKLSE/"
def build_profile():
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
profile.set_preference('browser.download.dir', DOWNLOAD_DIR)
# I think your `/zip` mime type was incorrect. This works for me
profile.set_preference('browser.helperApps.neverAsk.saveToDisk',
'application/vnd.ms-excel,application/zip')
return profile
# Retry is an elegant way to retry the browser creation
# Though you should narrow the scope to whatever the actual exception is you are
# retrying on
#retry(Exception, tries=5, delay=3)
#contextmanager # This turns get_browser into a context manager
def get_browser():
# Use a context manager with Display, so it will be closed even if an
# exception is thrown
profile = build_profile()
with Display(visible=0, size=(800, 600)):
browser = webdriver.Firefox(profile)
print("firefox")
try:
yield browser
finally:
# Let a try/finally block manage closing the browser, even if an
# exception is called
browser.quit()
def main():
print("hello from python 2")
with get_browser() as browser:
browser.get("https://www.shareinvestor.com/my")
# Click the login button
# waiter is a helper function that makes it easy to use explicit waits
# with it you dont need to use time.sleep() calls at all
login_xpath = '//*/div[#class="sic_logIn-bg"]/a'
waiter.find_element(browser, login_xpath, XPATH).click()
print(browser.current_url)
# Log in
username = "bkcollection"
username_id = "sic_login_header_username"
password = "123456"
password_id = "sic_login_header_password"
waiter.find_write(browser, username_id, username, by=ID)
waiter.find_write(browser, password_id, password, by=ID, send_enter=True)
# Wait for login process to finish by locating an element only found
# after logging in, like the Logged In Nav
nav_id = 'sic_loggedInNav'
waiter.find_element(browser, nav_id, ID)
print("log in done")
# Load the target page
target_url = ("https://www.shareinvestor.com/prices/price_download.html#/?"
"type=price_download_all_stocks_bursa")
browser.get(target_url)
print(browser.current_url)
# CLick download button
all_data_xpath = ("//*[#href='/prices/price_download_zip_file.zip?"
"type=history_all&market=bursa']")
waiter.find_element(browser, all_data_xpath, XPATH).click()
# This is a bit challenging: You need to wait until the download is complete
# This file is 220 MB, it takes a while to complete. This method waits until
# there is at least one file in the dir, then waits until there are no
# filenames that end in `.part`
# Note that is is problematic if there is already a file in the target dir. I
# suggest looking into using the tempdir module to create a unique, temporary
# directory for downloading every time you run your script
print("Waiting for download to complete")
at_least_1 = lambda x: len(x("{0}/*.zip*".format(DOWNLOAD_DIR))) > 0
WebDriverWait(glob.glob, 300).until(at_least_1)
no_parts = lambda x: len(x("{0}/*.part".format(DOWNLOAD_DIR))) == 0
WebDriverWait(glob.glob, 300).until(no_parts)
print("Download Done")
# Now do whatever it is you need to do with the zip file
# zip_ref = zipfile.ZipFile(DOWNLOAD_DIR, 'r')
# zip_ref.extractall(DOWNLOAD_DIR)
# zip_ref.close()
# os.remove(zip_ref)
print("Done!")
if __name__ == "__main__":
main()
Full disclosure: I maintain the explicit module. It is designed to make using explicit waits much easier, for exactly situations like this, where websites slowly load in dynamic content based on user interactions. You could replace all of the waiter.XXX calls above with direct explicit waits.
Take it out side the scope of the selenium. Change the preference settings so that when the link is clicked (First check if link is valid) it gives you a pop up asking to save , now use sikuli http://www.sikuli.org/ to click on the popup.
Mime types does not always work, and there is no black and white answer why it is not working.
The reason is due to the webpage is loading slowly. I added a wait of 20 seconds after open the webpage link
login_attempt.submit()
browser.get("https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa")
print browser.current_url
time.sleep(20)
dl = browser.find_element_by_xpath("//*[#href='/prices/price_download_zip_file.zip?type=history_all&market=bursa']").click()
It returns no error.
Additional,
/zip is incorrect MIME type. Change to profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/zip')
The final correction :
#!/usr/bin/python
print "hello from python 2"
import urllib2
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from pyvirtualdisplay import Display
import requests, zipfile, os
display = Display(visible=0, size=(800, 600))
display.start()
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
profile.set_preference('browser.download.dir', "/home/vinvin/shKLSE/")
# application/zip not /zip
profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/zip')
for retry in range(5):
try:
browser = webdriver.Firefox(profile)
print "firefox"
break
except:
time.sleep(3)
time.sleep(1)
browser.get("https://www.shareinvestor.com/my")
time.sleep(10)
login_main = browser.find_element_by_xpath("//*[#href='/user/login.html']").click()
print browser.current_url
username = browser.find_element_by_id("sic_login_header_username")
password = browser.find_element_by_id("sic_login_header_password")
print "find id done"
username.send_keys("bkcollection")
password.send_keys("123456")
print "log in done"
login_attempt = browser.find_element_by_xpath("//*[#type='submit']")
login_attempt.submit()
browser.get("https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa")
print browser.current_url
time.sleep(20)
dl = browser.find_element_by_xpath("//*[#href='/prices/price_download_zip_file.zip?type=history_all&market=bursa']").click()
time.sleep(30)
browser.close()
browser.quit()
display.stop()
zip_ref = zipfile.ZipFile('/home/vinvin/shKLSE/file.zip', 'r')
zip_ref.extractall('/home/vinvin/shKLSE')
zip_ref.close()
# remove with correct path
os.remove('/home/vinvin/shKLSE/file.zip')
I haven't tried on the site you mentioned, however following code works perfectly and downloads the ZIP. if you are not able to download the zip, Mime type could be different. you can use chrome browser and network inspection to check the mime type of the file you are trying to download.
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
profile.set_preference('browser.download.dir', "/home/vinvin/shKLSE/")
profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/zip')
browser = webdriver.Firefox(profile)
browser.get("http://www.colorado.edu/conflict/peace/download/peace.zip")
This is a follow-up question to this previous question on how to download ~1000 files from Google Patents.
I would like to iterate through a list of filenames fname = ["ipg150106.zip", "ipg150113.zip"] and simulate clicking and saving these files to my computer. The following example works for me and downloads a single file:
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
# Define parameters
savepath = 'D:\\' # set the desired path here for the files
# Download the files from Google Patents
profile = FirefoxProfile ()
profile.set_preference("browser.download.panel.shown", False)
profile.set_preference("browser.download.folderList", 2) # 2 means specify custom location
profile.set_preference("browser.download.manager.showWhenStarting", False)
profile.set_preference("browser.download.dir", savepath) # choose folder to download to
profile.set_preference("browser.helperApps.neverAsk.saveToDisk",'application/octet-stream')
driver = webdriver.Firefox(firefox_profile=profile)
url = 'https://www.google.com/googlebooks/uspto-patents-grants-text.html#2015'
driver.get(url)
filename = driver.find_element_by_xpath('//a[contains(text(), "ipg150106.zip")]')
filename.click()
I've tried to replace this with a list and a loop like this:
fname = ["ipg150106.zip", "ipg150113.zip"]
for f in fname:
filename = driver.find_element_by_xpath('//a[contains(text(), f)]')
filename.click()
print('Finished loop for: {}.'.format(f))
However, the browser opens, but nothing happens (no clicking on files). Any ideas?
You need to pass the filename into the XPath expression:
filename = driver.find_element_by_xpath('//a[contains(text(), "{filename}")]'.format(filename=f))
Though, an easier location technique here would be "by partial link text":
for f in fname:
filename = driver.find_element_by_partial_link_text(f)
filename.click()