Is it possible to download a CSV file using Selenium with Python and then delete it, or to download the file only temporarily?
This is the code i am using to download the csv file
# Configure a Firefox profile so CSV downloads are saved automatically
# (no download dialog) into the current working directory.
import os

from selenium import webdriver

fp = webdriver.FirefoxProfile()
# 2 = save downloads to the custom directory given in browser.download.dir
fp.set_preference("browser.download.folderList", 2)
fp.set_preference("browser.download.manager.showWhenStarting", False)
fp.set_preference("browser.download.dir", os.getcwd())
# MIME types to save to disk without prompting.  The original list had the
# typo "application/pdfss" and listed text/csv twice; fixed here.
fp.set_preference(
    "browser.helperApps.neverAsk.saveToDisk",
    "application/csv,text/csv,application/pdf,application/excel",
)
# Disable the built-in PDF viewer so PDFs download instead of rendering.
fp.set_preference("pdfjs.disabled", True)
browser = webdriver.Firefox(firefox_profile=fp)
You can mention the path to your file and use os.remove() to remove/delete the file.
EDIT: If you wish to fetch the name of the file which you have downloaded (I don't think selenium has added this functionality yet) you can try checking the difference in the directory listing before and after downloading the file by using os.listdir().
import os

# Directory Selenium is configured to download into.
download_dir = '/home/jason/Downloads'

# Snapshot the directory contents before the download so the newly created
# file can be identified afterwards by set difference.
before = os.listdir(download_dir)
# Download the file using Selenium here
after = os.listdir(download_dir)
change = set(after) - set(before)
if len(change) == 1:
    file_name = change.pop()  # file name stored as string (bare name, no dir)
else:
    # Either nothing was downloaded or something else wrote to the directory
    # at the same time; the single-file assumption no longer holds.
    print("More than one file or no file downloaded")
I have added one line of code and the following solution works
import os

# Directory Selenium is configured to download into.
download_dir = '/home/jason/Downloads'

before = os.listdir(download_dir)
# Download the file using Selenium here
after = os.listdir(download_dir)
change = set(after) - set(before)
if len(change) == 1:
    file_name = change.pop()  # file name stored as string
    # os.listdir() returns bare names, so the path must be joined with the
    # download directory; the original's os.remove(file_name) looked for the
    # file in the current working directory and would raise FileNotFoundError.
    os.remove(os.path.join(download_dir, file_name))
else:
    print("More than one file or no file downloaded")
Related
I am re-framing an existing question for simplicity. I have the following code to download Excel files from a company Share Point site.
import requests
import pandas as pd
def download_file(url):
    """Download *url* and save it under its basename in the current directory.

    Raises requests.HTTPError on a non-2xx response instead of silently
    writing the response body to disk.  The original wrote r.content
    unconditionally, which is how a SharePoint login/error HTML page ends up
    saved with an .xlsm name that Excel then refuses to open.
    """
    filename = url.split('/')[-1]
    r = requests.get(url)
    r.raise_for_status()  # fail loudly on 401/403/404 instead of saving HTML
    with open(filename, 'wb') as output_file:
        output_file.write(r.content)
# Read the workbook that holds the SharePoint download links.
df = pd.read_excel(r'O:\Procurement Planning\QA\VSAF_test_macro.xlsm')
df['Name'] = 'share_point_file_path_documentName' #i'm appending the sp file path to the document name
# NOTE(review): the line above assigns the SAME literal string to every row,
# so every loop iteration below requests the identical (non-URL) value —
# presumably a placeholder for building the real per-document SharePoint URL;
# confirm against the actual workbook contents.
file = df['Name'] #I only need the file path column, I don't need the rest of the dataframe
# for loop for download
for url in file:
    download_file(url)
The downloads happen and I don't get any errors in Python, however when I try to open them I get an error from Excel saying Excel cannot open the file because the file format or extension is not valid. If I print the link in Jupyter Notebooks it does open correctly, the issue appears to be with the download.
Check r.status_code. This must be 200 or you have the wrong url or no permission.
Open the downloaded file in a text editor. It might be an HTML file (Office Online).
If the URL contains a web=1 query parameter, remove it or replace it by web=0.
I am writing a small Python script to download a file from the follow link below and retrieve the original filename
and its extension. But I have come across one such follow link for which Python downloads the file without any extension, whereas the file has a .txt extension when downloaded using a browser.
Below is the code I am trying :
from urllib.request import urlopen
from urllib.parse import unquote

import wget

# Destination directory for the downloaded file.
filePath = 'D:\\folder_path'
followLink = 'http://example.com/Reports/Download/c4feb46c-8758-4266-bec6-12358'

response = urlopen(followLink)
if response.code == 200:
    print('Follow Link(response url) :' + response.url)
    print('\n')
    unquote_url = unquote(response.url)
    # wget derives the name from the URL path, so a bare-GUID URL yields a
    # file with no extension; the server's Content-Disposition header
    # (via response.info()) is where the real filename would come from.
    file_name = wget.detect_filename(response.url).replace('|', '_')
    print('file_name - ' + file_name)
    # The original had the argument broken across two lines ("filePa / th"),
    # which is a SyntaxError; the identifier is rejoined here.
    wget.download(response.url, filePath)
file_name variable in above code is just giving 'c4feb46c-8758-4266-bec6-12358' as filename.
Where I want to download it as c4feb46c-8758-4266-bec6-12358.txt.
I have also tried to read file name from header i.e. response.info(). But not getting proper file name.
Can anyone please help me with this? I am stuck in my work. Thanks in advance.
Wget gets the filename from the URL itself. For example, if your URL was https://someurl.com/filename.pdf, it is saved as filename.pdf. If it was https://someurl.com/filename, it is saved as filename. Since wget.download returns the filename of the downloaded file, you can rename it to any extension you want with os.rename(filename, filename+'.<extension>').
I am trying to convert multiple HTML files into a PDF using pdfkit. This is my code:
from bs4 import BeautifulSoup
from selenium import webdriver
import pdfkit
import time  # original called time.sleep() without importing time (NameError)

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/in/jaypratappandey/')
time.sleep(40)  # crude wait for the profile page to finish loading

soup = BeautifulSoup(driver.page_source, 'lxml')

# Write the scraped sections out as two HTML files.  "with" guarantees each
# file is flushed and closed before pdfkit reads it; the original never
# closed them, so pdfkit could see empty or partially written files.
with open('tophtmlfile.html', 'w') as top:
    for name in soup.select('.pv-top-card-section__body'):
        top.write("%s" % name)

with open('htmlfile.html', 'w') as f:
    for item in soup.select('.pv-oc.ember-view'):
        f.write("%s" % item)

# NOTE: passing a list requires a wkhtmltopdf build with patched QT; see the
# answer below for merging the files first when only an unpatched build is
# available.
pdfkit.from_file(['tophtmlfile.html', 'htmlfile.html'], 'jayprofile.pdf')
driver.quit()
This code give the following error:
Traceback (most recent call last):
File "lkdndata.py", line 23, in <module>
pdfkit.from_file(['tophtmlfile.html', 'htmlfile.html'], 'ankurprofile.pdf')
File "/usr/local/lib/python3.5/dist-packages/pdfkit/api.py", line 49, in from_file
return r.to_pdf(output_path)
File "/usr/local/lib/python3.5/dist-packages/pdfkit/pdfkit.py", line 156, in to_pdf
raise IOError('wkhtmltopdf reported an error:\n' + stderr)
OSError: wkhtmltopdf reported an error:
Error: This version of wkhtmltopdf is build against an unpatched version of QT, and does not support more then one input document.
Exit with code 1, due to unknown error.
The solution I found was to first merge the HTML files into one and then convert that using pdfkit. So in your case, you would save the tophtml and html files together in the same directory and replace the path below with the path to that directory.
import pdfkit
import os
# path to folder containing html files
path = "/home/ec2-user/data-science-processes/src/results/"
def multiple_html_to_pdf(path):
    """Merge every .html file in *path* into a single document and convert
    it to Report.pdf.

    Works around wkhtmltopdf builds against unpatched QT, which reject more
    than one input document per invocation.
    """
    empty_html = '<html><head></head><body></body></html>'
    for file in os.listdir(path):
        # skip our own output so a second run doesn't re-merge it
        if file.endswith(".html") and file != 'merged.html':
            print(file)
            # append this file's markup just before the closing tags
            with open(os.path.join(path, file), 'r') as f:
                html = f.read()
            empty_html = empty_html.replace('</body></html>', html + '</body></html>')

    # Save the merged HTML next to the inputs and convert that SAME file.
    # The original wrote 'merged.html' to the CWD but told pdfkit to read it
    # from a hard-coded absolute path, so the two could silently diverge.
    merged_path = os.path.join(path, 'merged.html')
    with open(merged_path, 'w') as f:
        f.write(empty_html)

    pdfkit.from_file(merged_path, 'Report.pdf')

multiple_html_to_pdf(path)
I had the same error. The error you are probably getting is due to an inconsistency in your Qt installation and the unavailability of a compatible Qt version.
Try running
wkhtmltopdf
on your terminal and see whether you can find "Reduced Functionality".
If yes then my assumption is correct and then your safest bet would be to compile it from source.
I'm not sure if this can be done or not but I thought I'd ask!
I'm running a windows 10 PC using Python 2.7. I'm wanting to download a file form sharepoint to a folder on my C: drive.
OpenPath = "https://office.test.com/sites/Rollers/New improved files/"
OpenFile = "ABC UK.xlsb"
The downside is the file is uploaded externally & due to human error it can be saved as a .xlsx or ABC_UK. Therefor I only want to use the first 3 characters with a wildcard (ABC*) to open that file. Thankfully the first 3 Characters are unique & there only be one file in the path that should match.
to find the file in your dir:
import os, requests, fnmatch

# NOTE(review): OpenPath (defined in the question) is an HTTPS SharePoint
# URL; os.listdir() only accepts local filesystem paths, so this loop raises
# an error unless the SharePoint library is mounted as a mapped drive —
# confirm how OpenPath is actually accessed.
#loop over dir
for filename in os.listdir(OpenPath):
    #find the file
    # NOTE(review): the question asked for the wildcard 'ABC*', but this
    # pattern only matches names starting with 'ABC_UK' — verify intent.
    if fnmatch.fnmatch(filename, 'ABC_UK*'):
        #download the file
        # open file handler
        # NOTE(review): 'C:\dwnlfile.xls' works only because \d is not a
        # recognised escape sequence; a raw string would be safer.
        with open('C:\dwnlfile.xls', 'wb') as fh:
            #try to get it
            result = requests.get(OpenPath+filename)
            #check u got it
            if not result.ok:
                print result.reason # or text
                exit(1)
            #save it
            fh.write(result.content)
            print 'got it and saved'
I was wondering if somebody here could help me out by creating a script? I have never done something like this before, so I have no idea what I’m doing. But I have been reading about it for a couple of days now and I’m still not understanding it, so I appreciate all the help I can get. I’m even willing to pay for your service!
Here is an example of my problem. I have for the moment a CSV file named “Stars” saved on my windows desktop containing around 50.000 different links that directly starts downloading a xls file when pressed. Each row contains one of these links. I would want with your help create some kind of script for this that will make some kind of loop thru each row and visit this different links so it can download these 50.000 different files.
Thank you all for taking time to read this
/ Sarah
Say your CSV file looks like:
http://www.ietf.org/rfc/rfc959.txt
http://www.ietf.org/rfc/rfc1579.txt
http://www.ietf.org/rfc/rfc2577.txt
replace path to csvfile and targetdir in python code:
import os
import urllib2
csvfile = '/tmp/links.csv'
targetdir = '/tmp/so'
with open(csvfile) as links:
for link in links:
filename = link.split('/')[-1].strip()
filepath = os.path.join(targetdir, filename)
print 'Downloading %s \n\t .. to %s' % (link.strip(), filepath)
with open(filepath, 'w') as data:
xlsfile = urllib2.urlopen(link)
data.writelines(xlsfile)
Example of usage:
$ python download_all.py
Downloading http://www.ietf.org/rfc/rfc959.txt
.. to /tmp/so/rfc959.txt
Downloading http://www.ietf.org/rfc/rfc1579.txt
.. to /tmp/so/rfc1579.txt
Downloading http://www.ietf.org/rfc/rfc2577.txt
.. to /tmp/so/rfc2577.txt
$ dir -1 /tmp/so
rfc1579.txt
rfc2577.txt
rfc959.txt
Good Luck.
Another Solution:
Without more information, the best answer I can give you on this question would be to use Selenium to download the file and the csv module to parse your csv with the links.
Example:
import csv

from selenium import webdriver

# Firefox profile that auto-saves CSV downloads into a fixed directory.
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
# Raw strings so the backslashes in the Windows paths are taken literally.
profile.set_preference('browser.download.dir', r'PATH\TO\DOWNLOAD\DIRECTORY')
profile.set_preference('browser.helperApps.neverAsk.saveToDisk', "application/csv")
driver = webdriver.Firefox(firefox_profile=profile)

input_csv_location = r"PATH\TO\CSV.csv"

# The original opened `csv_location`, a name that was never defined
# (NameError); the variable is `input_csv_location`.
with open(input_csv_location, 'r') as input_csv:
    reader = csv.reader(input_csv)
    for line in reader:
        # assumes no header row and the URL in the first column
        driver.get(line[0])