Problem opening up text file of the downloadPath with gedit - python

After downloading the dependencies from nexus, I have a download path for the data to be in, but I wasn't able to open the textile its not responding, why is this so?
for item in data["items"]:
for asset in item["assets"]:
fileurl = asset["downloadUrl"]
print(fileurl)
downloadPath = '/home/centos/'
filename = downloadPath + fileurl.split('/')[-1]# '\' for Windows
outfile = open(filename, "w")
outfile.write(str(urllib.request.urlopen(fileurl).read()))
outfile.close()
if data["continuationToken"] is None:
sys.exit()
else:
#construct pagination url and loop
url = baseurl + 'components?continuationToken=' + data["continuationToken"] + '&repository=' + downloadRepository
return

Related

How can I mock downloading file using request-mock

Good day, I need to test this function using request_mock:
def loader(link, output='os.getcwd'):
# Function loading a page from the link
response = requests.get(link)
data = response.text
file_name = modify_file_name(link)
if output == 'os.getcwd':
directory = os.getcwd()
else:
directory = output
filepath = os.path.join(directory, file_name + '.html')
with open(filepath, 'w') as page:
page.write(data)
return filepath
I didn't find a tutorial of instruction for beginners. I would be grateful for advice.

Install System Fonts in Powershell (or python)

At the beginning, I simplly use the COM 0x14 to install. And it works when I test it.
But when I use it in my program, it doesn't works.
links = ['https://p-f-t.com/cdn/CaskaydiaCove.zip', 'https://p-f-t.com/cdn/FiraCode.zip']
print('Downloading Nerd Font'.center(45,'=') + '\n')
def downFile(url):
resp = requests.get(url)
file_name = os.path.basename(url)
file = open(file_name, 'wb')
file.write(resp.content)
file.close()
print('Download', file_name, 'done!')
return file_name
font_zip_list = []
for link in links:
font_zip_list.append(downFile(link))
print('Installing Nerd Font'.center(45,'=') + '\n')
font_inst_script = '''$FONTS = 0x14
$objShell = New-Object -ComObject Shell.Application
$objFolder = $objShell.Namespace($FONTS)'''
def unzip_inst(zipfile_name):
zip_file = zipfile.ZipFile(zipfile_name, 'r')
font_list = []
for i in zip_file.namelist():
zip_file.extract(i, os.getcwd())
if i.rfind(r'/') != len(i) - 1:
print('Extracted', i, 'from', zipfile_name)
font_list.append(i)
return font_list
font_list = []
for z in font_zip_list:
font_list += unzip_inst(z)
print(font_list)
for f in font_list:
font_inst_script += '\n$objFolder.CopyHere("{font_path}")'.format(font_path='./' + f)
script = open('install.ps1', 'w')
script.write(font_inst_script)
script.close()
os.system('powershell.exe ./install.ps1')
print(''.center(45,'=') + '\n')
I need a simple way to install fonts families on python or powershell.

Having trouble into saving something to a csv file

My program does all that I want, but is not saving the final data to the csv file, I used a print before it to see if the data was right and it is, It is just not writing to the csv file, I'm using 'a' because I don't want it to rewrite what's already written, but it is still returning an error.
here's the part of the code:
soup = BeautifulSoup(answer)
for table in soup.findAll('table', {"class":"formTable"}):
for row in table.findAll('tr'):
#heading = row.find('td', {"class":"sectionHeading"})
#if heading is not None:
#print(heading.get_text());
#else:
label = row.find('td', {"class":"fieldLabel"})
data = row.find('td', {"class":"fieldData"})
if data is not None and label is not None:
csvline += label.get_text() + "," + data.get_text() + ","
print(csvline)
#csvline.encode('utf-8')
with open ('output_file_two.csv', 'a', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(csvline)
Here's the error:
Traceback (most recent call last):
File "C:\PROJECT\pdfs\final.py", line 95, in <module>
with open ('output_file_two.csv', 'a', encoding='utf-8') as f:
TypeError: 'encoding' is an invalid keyword argument for this function
Here's the entire program code in case of need
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
#import unicodecsv as csv
import csv
#import pickle
import requests
from robobrowser import RoboBrowser
import codecs
def rename_files():
file_list = os.listdir(r"C:\\PROJECT\\pdfs")
print(file_list)
saved_path = os.getcwd()
print('Current working directory is '+saved_path)
os.chdir(r'C:\\PROJECT\\pdfs')
for file_name in file_list:
os.rename(file_name, file_name.translate(None, " "))
os.chdir(saved_path)
rename_files()
def run(command):
if platform.system() != 'Windows':
args = shlex.split(command)
else:
args = command
s = subprocess.Popen(args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output, errors = s.communicate()
return s.returncode == 0, output, errors
# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
print "%s is not a directory" % base_directory
exit(1)
# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
print "Could not find %s" % bin_path
exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
for file_name in file_name_list:
# If this is not a PDF file
if not file_name.endswith('.pdf'):
# Skip it
continue
file_path = os.path.join(dir_path, file_name)
# Convert your PDF to HTML here
args = (bin_path, file_name, file_path)
success, output, errors = run("python %s -o %s.html %s " %args)
if not success:
print "Could not convert %s to HTML" % file_path
print "%s" % errors
htmls_path = 'C:\\PROJECT'
with open ('score.csv', 'w') as f:
writer = csv.writer(f)
for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
for file_name in file_name_list:
if not file_name.endswith('.html'):
continue
with open(file_name) as markup:
soup = BeautifulSoup(markup.read())
text = soup.get_text()
match = re.findall("PA/(\S*)", text)#To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s*
print(match)
writer.writerow(match)
for item in match:
data = item.split('/')
case_number = data[0]
case_year = data[1]
csvline = case_number + ","
browser = RoboBrowser()
browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
form = browser.get_forms()[0] # Get the first form on the page
form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
# Use BeautifulSoup to parse this data
answer = browser.response.text
#print(answer)
soup = BeautifulSoup(answer)
for table in soup.findAll('table', {"class":"formTable"}):
for row in table.findAll('tr'):
#heading = row.find('td', {"class":"sectionHeading"})
#if heading is not None:
#print(heading.get_text());
#else:
label = row.find('td', {"class":"fieldLabel"})
data = row.find('td', {"class":"fieldData"})
if data is not None and label is not None:
csvline += label.get_text() + "," + data.get_text() + ","
print(csvline)
with open ('output_file_two.csv', 'a') as f:
writer = csv.writer(f)
writer.writerow(csvline)
EDIT
It's working, here's the code working
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import unicodecsv as csv
import requests
from robobrowser import RoboBrowser
import codecs
def rename_files():
file_list = os.listdir(r"C:\\PROJECT\\pdfs")
print(file_list)
saved_path = os.getcwd()
print('Current working directory is '+saved_path)
os.chdir(r'C:\\PROJECT\\pdfs')
for file_name in file_list:
os.rename(file_name, file_name.translate(None, " "))
os.chdir(saved_path)
rename_files()
def run(command):
if platform.system() != 'Windows':
args = shlex.split(command)
else:
args = command
s = subprocess.Popen(args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output, errors = s.communicate()
return s.returncode == 0, output, errors
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
print "%s is not a directory" % base_directory
exit(1)
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
print "Could not find %s" % bin_path
exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
for file_name in file_name_list:
if not file_name.endswith('.pdf'):
continue
file_path = os.path.join(dir_path, file_name)
args = (bin_path, file_name, file_path)
success, output, errors = run("python %s -o %s.html %s " %args)
if not success:
print "Could not convert %s to HTML" % file_path
print "%s" % errors
htmls_path = 'C:\\PROJECT'
with open ('score.csv', 'w') as f:
writer = csv.writer(f)
for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
for file_name in file_name_list:
if not file_name.endswith('.html'):
continue
with open(file_name) as markup:
soup = BeautifulSoup(markup.read())
text = soup.get_text()
match = re.findall("PA/(\S*)", text)
print(match)
writer.writerow(match)
for item in match:
data = item.split('/')
case_number = data[0]
case_year = data[1]
csvline = case_number + ","
browser = RoboBrowser()
browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
form = browser.get_forms()[0]
form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
answer = browser.response.text
soup = BeautifulSoup(answer)
for table in soup.findAll('table', {"class":"formTable"}):
for row in table.findAll('tr'):
label = row.find('td', {"class":"fieldLabel"})
data = row.find('td', {"class":"fieldData"})
if data is not None and label is not None:
csvline += label.get_text() + "," + data.get_text() + ","
print(csvline)
my_file = codecs.open('final_output.csv', 'a', 'utf-8')
my_file.write(csvline)
At the end there is a problem with your code
writer = csv.writer(f)
csv.writer(csvline) # here is the problem
See you initialize the writer, but then you don't use it.
writer = csv.writer(f)
writer.writerow(csvline)
Here :
with open ('output_file_two.csv', 'a') as f:
writer = csv.writer(f)
csv.writer (csvline)
You are instanciating a csv.writer, but not using it. This should read:
with open ('output_file_two.csv', 'a') as f:
writer = csv.writer(f)
writer.write(csvline)
Now there are quite a few other problems with your code, the first one being to manually create the 'csvline as text then using csv.writer to store it to file. csv.writer.write() expects a list of rows (tuples) and takes care of properly escaping what needs to be escaped, inserting the proper delimiters etc. It also has a writerow() method that takes a single tuple and so avoid building the whole list in memory FWIW.

Throttling FTP download with Python ftplib

How do I throttle the FTP download with Python ftplib? For example put a cap on the speed to be 20Mb/s?
I'm using the following code to download files with Python ftplib:
from ftplib import FTP
import os
download_list = 'testlist.txt' # inital list of directories to be downloaded
path_list = [] # initalize a list of all the pathes from download_list
local_folder = 'testStorage' #where files are going to be downloaded to
downloaded_list = 'completedownload.txt' # list of completed downloads
error_list = 'incomplete_downloads.txt' # list of paths that are incomplete
ftp=FTP("ftp.address.com")
ftp.login("user_name","password") #login to FTP account
print "Successfully logged in"
# make a list of files to download from a file
with open(download_list, 'r') as f:
content = f.readlines()
path_list = [x.strip() for x in content]
for path in path_list:
path = path.replace("*","") # strips the * found in the source file
print '\nChanging directory to ' + path + ':\n'
#ftp.cwd('/AAA/BBB/CCC/logic-1/') #the format to change into path note the * is omitted
#if ftp.cwd(path) == True:
try: # tries the path in the file
ftp.cwd(path)
#ftp.retrlines('LIST')
filenames = ftp.nlst()
for filename in filenames:
local_directory = local_folder+path # create the local path ie : testStorage/AAA/BBB/CCC/logic-1/
local_filename = os.path.join(local_directory,filename) #
if os.path.exists(local_filename) == False: # checks if file already exists
if not os.path.exists(local_directory): # mimic the remote path locally
os.makedirs(local_directory)
file = open(local_filename,'wb')
ftp.retrbinary('RETR '+ filename, file.write)
print filename
file.close()
elif os.path.exists(local_filename) == True: # skip the file if it exits
print 'File ' +filename + ' already exists, skipping this file'
except: #if path in text file does not exist write to error_list.txt
print 'Path ' + path + ' does not exist writing path to error_list.txt'
with open(error_list, 'a') as f2:
f2.write(path+'\n')
continue
print "all done closing connection"
ftp.close() #CLOSE THE FTP CONNECTION
To throttle the download, just implement a function that does file.write and time.sleep as needed. Pass that function to ftp.retrbinary as callback (instead of file.write directly).
This pseudo code (I do not do Python) should give you some idea:
total_length = 0
start_time = time.time()
def write_and_sleep(buf):
global file
global total_length
global start_time
file.write(buf)
total_length += sys.getsizeof(buf)
while (total_length / (time.time() - start_time)) > 100000000:
time.sleep(0.1)
ftp.retrbinary('RETR '+ filename, write_and_sleep)
Reducing maxblocksize (the 3rd argument of ftp.retrbinary) may help achieving more smooth "download curve".

Python (3.5) - Constructing String to Save File - String Contains Escape Characters

I am using Python (3.5) to loop through some .msg files, extract data from them, which contains a url to download a file and a folder that the file should go into. I have successfully extracted the data from the .msg file but now when I try to piece together the absolute file path for the downloaded file, the format ends up weird, with backslashes and \t\r.
Here's a shortened view of the code:
for file in files:
file_abs_path = script_dir + '/' + file
print(file_abs_path)
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
msg = outlook.OpenSharedItem(file_abs_path)
pattern = re.compile(r'(?:^|(?<=\n))[^:<\n]*[:<]\s*([^>\n]*)', flags=re.DOTALL)
results = pattern.findall(msg.Body)
# results[0] -> eventID
regexID = re.compile(r'^[^\/\s]*', flags=re.DOTALL)
filtered = regexID.findall(results[0])
eventID = filtered[0]
# print(eventID)
# results[1] -> title
title = results[1].translate(str.maketrans('','',string.punctuation)).replace(' ', '_') #results[1]
title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore')
title = title.decode('UTF-8')
#results[1]
print(title)
# results[2] -> account
regexAcc = re.compile(r'^[^\(\s]*', flags=re.DOTALL)
filtered = regexAcc.findall(results[2])
account = filtered[0]
account = unicodedata.normalize('NFKD', account).encode('ascii', 'ignore')
account = account.decode('UTF-8')
# print(account)
# results[3] -> downloadURL
downloadURL = results[3]
# print(downloadURL)
rel_path = account + '/' + eventID + '_' + title + '.mp4'
rel_path = unicodedata.normalize('NFKD', rel_path).encode('ascii', 'ignore')
rel_path = rel_path.decode('UTF-8')
filename_abs_path = os.path.join(script_dir, rel_path)
# Download .mp4 from a url and save it locally under `file_name`:
with urllib.request.urlopen(downloadURL) as response, open(filename_abs_path, 'wb') as out_file:
shutil.copyfileobj(response, out_file)
# print item [ID - Title] when done
print('[Complete] ' + eventID + ' - ' + title)
del outlook, msg
So as you can see I have some regex that extracts 4 pieces of data from the .msg. Then I have to go through each one and do some further fine tuning, but then have what I need:
eventID
# 123456
title
# Name_of_item_with_underscord_no_punctuation
account
# nameofaccount
downloadURL
# http://download.com/basicurlandfile.mp4
So this is the data I get, and I've print() it off and it doesn't have any weird characters. But when I try to construct the path for the .mp4 (filename and directory):
downloadURL = results[3]
# print(downloadURL)
rel_path = account + '/' + eventID + '_' + title + '.mp4'
rel_path = unicodedata.normalize('NFKD', rel_path).encode('ascii', 'ignore')
rel_path = rel_path.decode('UTF-8')
filename_abs_path = os.path.join(script_dir, rel_path)
# Download .mp4 from a url and save it locally under `file_name`:
with urllib.request.urlopen(downloadURL) as response, open(filename_abs_path, 'wb') as out_file:
shutil.copyfileobj(response, out_file)
After doing this, the output I get from running the code is:
Traceback (most recent call last): File "sfaScript.py", line 65, in <module> with urllib.request.urlopen(downloadURL) as response, open(filename_abs_path, 'wb') as out_file:
OSError: [Errno 22] Invalid argument: 'C:/Users/Kenny/Desktop/sfa_kenny_batch_1\\accountnamehere/123456_Name_of_item_with_underscord_no_punctuation\t\r.mp4'
TL;DR - QUESTION
So the filename_abs_path somehow got changed to
C:/Users/Kenny/Desktop/sfa_kenny_batch_1\\accountnamehere/123456_Name_of_item_with_underscord_no_punctuation\t\r.mp4
I need it to be
C:/Users/Kenny/Desktop/sfa_kenny_batch_1/accountnamehere/123456_Name_of_item_with_underscord_no_punctuation.mp4
Thanks for any help provided!
Looks like your regex captured a tabulation char (\t) and a linefeed char (\r) in title
A quickfix for this would be:
title = title.strip()
(before composing the filename)
which removes all "blank" chars, including tabulations and carriage return chars.

Categories

Resources