Good day, I need to test this function using requests_mock:

import os
import requests

def loader(link, output='os.getcwd'):
    # Load a page from the link and save it to disk
    response = requests.get(link)
    data = response.text
    file_name = modify_file_name(link)  # helper defined elsewhere in the project
    if output == 'os.getcwd':
        directory = os.getcwd()
    else:
        directory = output
    filepath = os.path.join(directory, file_name + '.html')
    with open(filepath, 'w') as page:
        page.write(data)
    return filepath

I couldn't find a tutorial or instructions for beginners. I would be grateful for advice.
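For what it's worth, here is a minimal sketch of such a test, assuming pytest plus the requests_mock package, and assuming loader and its helper modify_file_name can be imported from a module hypothetically named page_loader:

import requests_mock

from page_loader import loader  # hypothetical module name

def test_loader(tmp_path):
    # register a fake response for the URL so no real network call is made
    url = 'https://example.com/page'
    with requests_mock.Mocker() as m:
        m.get(url, text='<html>hello</html>')
        filepath = loader(url, output=str(tmp_path))
    # the saved file should contain exactly the mocked body
    with open(filepath) as saved:
        assert saved.read() == '<html>hello</html>'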
with open('URLS/Gibiru_urls.txt', 'r') as urls:
    for url in urls.readlines():
        url = url.rstrip("\n")
        download_url(url)
I want to read URLs from a .txt file in one directory (Root/URLS/Gibiru_urls.txt) and output into another directory (Root/Images/Gibiru_pics). My Python file is located in Root.
def download_url(file_url):
    print("downloading: ", file_url)
    file_name_start_pos = file_url.rfind("/") + 1
    file_name = file_url[file_name_start_pos:]
    os.system("cd Images/Gibiru_pics")  # note: runs in a subshell, does not change the Python process's working directory
    r = requests.get(file_url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(file_name, 'wb') as f:
            for data in r:
                f.write(data)
I was able to re-direct. It was the os.chdir() method I was looking for.
def Gibiru():
    output_dir = '/multiple_image_gathering-main/Images/Gibiru_pics'
    with open('URLS/Gibiru_urls.txt', 'r') as urls:
        for url in urls.readlines():
            url = url.rstrip("\n")
            download_url(url, output_dir)

def download_url(file_url, output_dir):
    os.chdir(output_dir)
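For reference, here is a rough sketch of how the rest of download_url could look once it takes output_dir. It is only an assumption of the intent: it joins output_dir to the file name instead of calling os.chdir, so the process-wide working directory stays untouched, and the download part is taken from the earlier snippet:

import os
import requests

def download_url(file_url, output_dir):
    # build the destination path directly rather than changing directory
    file_name = file_url[file_url.rfind("/") + 1:]
    r = requests.get(file_url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(os.path.join(output_dir, file_name), 'wb') as f:
            for data in r:
                f.write(data)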
After downloading the dependencies from Nexus, I have a download path for the data to go into, but I wasn't able to open the resulting text file; it's not responding. Why is this?
for item in data["items"]:
    for asset in item["assets"]:
        fileurl = asset["downloadUrl"]
        print(fileurl)
        downloadPath = '/home/centos/'
        filename = downloadPath + fileurl.split('/')[-1]  # use '\' for Windows
        outfile = open(filename, "w")
        outfile.write(str(urllib.request.urlopen(fileurl).read()))
        outfile.close()

if data["continuationToken"] is None:
    sys.exit()
else:
    # construct pagination url and loop
    url = baseurl + 'components?continuationToken=' + data["continuationToken"] + '&repository=' + downloadRepository
    return
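One likely reason the file can't be read is that str(urllib.request.urlopen(fileurl).read()) writes the textual repr of a bytes object (b'...') into a text-mode file. A minimal sketch of writing the raw bytes instead, reusing fileurl and filename from the snippet above (an assumption about the intent, not tested against Nexus):

import urllib.request

# write the response body as raw bytes in binary mode instead of str(bytes)
with urllib.request.urlopen(fileurl) as resp, open(filename, "wb") as outfile:
    outfile.write(resp.read())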
I'm fairly new to using Python. I have been trying to set up a very basic web scraper to help speed up my workday; it is supposed to download images from a section of a website and save them.
I have a list of urls and I am trying to use urllib.request.urlretrieve to download all the images.
The output location (savepath) updates so it adds 1 to the current highest number in the folder.
I've tried a bunch of different ways but urlretrieve only saves the image from the last url in the list. Is there a way to download all the images in the url list?
to_download = ['url1', 'url2', 'url3', 'url4']
for t in to_download:
    urllib.request.urlretrieve(t, savepath)
This is the code I was trying to use to update the savepath every time
def getNextFilePath(photos):
    highest_num = 0
    for f in os.listdir(photos):
        if os.path.isfile(os.path.join(photos, f)):
            file_name = os.path.splitext(f)[0]
            try:
                file_num = int(file_name)
                if file_num > highest_num:
                    highest_num = file_num
            except ValueError:
                print('The file name "%s" is not an integer. Skipping' % file_name)
    output_file = os.path.join(output_folder, str(highest_num + 1))
    return output_file
As suggested by @vks, you need to update savepath (otherwise you save each url onto the same file). One way to do so is to use enumerate:
from urllib import request

to_download = ['https://edition.cnn.com/', 'https://edition.cnn.com/', 'https://edition.cnn.com/', 'https://edition.cnn.com/']

for i, url in enumerate(to_download):
    save_path = f'website_{i}.txt'
    print(save_path)
    request.urlretrieve(url, save_path)
which you may want to contract into:
from urllib import request
to_download=['https://edition.cnn.com/','https://edition.cnn.com/','https://edition.cnn.com/','https://edition.cnn.com/']
[request.urlretrieve(url, f'website_{i}.txt') for i, url in enumerate(to_download)]
See the Python 3 documentation on enumerate for usage examples, and the documentation on formatted string literals for the f'...{variable}' syntax.
For the second part of the question: I'm not sure exactly what you are trying to achieve, but:
def getNextFilePath(photos):
    file_list = os.listdir(photos)
    file_list = [int(s) for s in file_list if s.isdigit()]
    print(file_list)
    max_id_file = max(file_list)
    print(f'max id: {max_id_file}')
    output_file = os.path.join(output_folder, str(max_id_file + 1))
    print(f'output file path: {output_file}')
    return output_file
This will hopefully find all files that are named with digits (IDs), take the highest ID, and return a new file name as max_id + 1. I guess this would replace the save_path in your example.
Quickly coding, and modifying the above function so that it returns the max_id rather than the path, the code below should be a working example using the iterator:
import os
from urllib import request

photo_folder = os.path.curdir

def getNextFilePath(photos):
    file_list = os.listdir(photos)
    print(file_list)
    file_list = [int(os.path.splitext(s)[0]) for s in file_list if os.path.splitext(s)[0].isdigit()]
    if not file_list:
        return 0
    print(file_list)
    max_id_file = max(file_list)
    #print(f'max id:{max_id_file}')
    #output_file = os.path.join(photo_folder, str(max_id_file + 1))
    #print(f'output file path:{output_file}')
    return max_id_file

def download_pic(to_download):
    start_id = getNextFilePath(photo_folder)
    for i, url in enumerate(to_download):
        save_path = f'{i + start_id + 1}.png'  # +1 so the highest existing file is not overwritten
        output_file = os.path.join(photo_folder, save_path)
        print(output_file)
        request.urlretrieve(url, output_file)
You should add exception handling and so on, but this seems to be working, if I understood correctly.
Are you updating savepath? If you pass the same savepath to each loop iteration, it is likely just overwriting the same file over and over.
Hope that helps, happy coding!
I am trying to download, and save in a folder, all the PDFs contained in some websites with dynamic elements, e.g.: https://www.bankinter.com/banca/nav/documentos-datos-fundamentales
Every PDF in this URL has a similar href. Here are two of them:
"https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc=workspace://SpacesStore/fb029023-dd29-47d5-8927-31021d834757;1.0&nameDoc=ISIN_ES0213679FW7_41-Bonos_EstructuradosGarantizad_19.16_es.pdf"
"https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc=workspace://SpacesStore/852a7524-f21c-45e8-a8d9-1a75ce0f8286;1.1&nameDoc=20-Dep.Estruc.Cont.Financieros_18.1_es.pdf"
Here is what I did for another website; this code is working as desired:
import os
import requests

link = 'https://www.bankia.es/estaticos/documentosPRIIPS/json/jsonSimple.txt'
base = 'https://www.bankia.es/estaticos/documentosPRIIPS/{}'

dirf = os.environ['USERPROFILE'] + r"\Documents\TFM\PdfFolder"
if not os.path.exists(dirf): os.makedirs(dirf)
os.chdir(dirf)

res = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
for item in res.json():
    if 'nombre_de_fichero' not in item: continue
    link = base.format(item['nombre_de_fichero'])
    filename_bankia = item['nombre_de_fichero'].split('.')[-2] + ".PDF"
    with open(filename_bankia, 'wb') as f:
        f.write(requests.get(link).content)
You have to make a POST HTTP request with the appropriate JSON payload. Once you get the response, you have to parse two fields, objectId and nombreFichero, and use them to build the right links to the PDFs. The following should work:
import os
import json
import requests

url = 'https://bancaonline.bankinter.com/publico/rs/documentacionPrix/list'
base = 'https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc={}&nameDoc={}'
payload = {"cod_categoria": 2, "cod_familia": 3, "divisaDestino": None, "vencimiento": None, "edadActuarial": None}

dirf = os.environ['USERPROFILE'] + r"\Desktop\PdfFolder"
if not os.path.exists(dirf): os.makedirs(dirf)
os.chdir(dirf)

r = requests.post(url, json=payload)
for item in r.json():
    objectId = item['objectId']
    nombreFichero = item['nombreFichero'].replace(" ", "_")
    filename = nombreFichero.split('.')[-2] + ".PDF"
    link = base.format(objectId, nombreFichero)
    with open(filename, 'wb') as f:
        f.write(requests.get(link).content)
After executing the above script, give it a little time to finish, as the site is really slow.
I am trying to create a recursive file structure, but when I try to create a file, it creates the first folder but no further folders.
import urllib2
from bs4 import BeautifulSoup
import shutil
import os

ext = [".html", ".jpeg", ".png", ".gif", ".jpg"]

def findLinks(url, newPath):
    resp = urllib2.urlopen(url)  # open first link
    if resp.getcode() == 200:
        if "text/html" in resp.headers["content-type"]:
            s = BeautifulSoup(resp.read(), "html.parser")
            links = s.find_all('a')  # put all <a> links into the links list
            for link in links:
                f = link['href']
                print f
                newDir = newPath + f
                if not os.path.isdir(newDir):  # if it doesn't already exist
                    if not newDir.endswith(tuple(ext)):  # if not a file
                        os.makedirs(newDir)  # create all directories
                    if newDir.endswith(".html"):
                        newFile = open(newDir, 'w+')
                        newFile.write("sample text")
                        newFile.close()
            return links

findLinks('http://localhost/onlinecontent/Test', '/Test')
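A side note: findLinks is only ever called once, so it can only create folders for links found on the first page. If the goal really is a recursive structure, it would have to descend into the pages it discovers. A rough, untested sketch of that idea, keeping the original Python 2 style and assuming the hrefs are relative to the starting URL:

def crawl(url, newPath):
    # create folders/files for this page, then descend into anything that
    # does not look like a plain file
    links = findLinks(url, newPath)
    for link in links or []:
        f = link['href']
        if not f.endswith(tuple(ext)):
            crawl(url + "/" + f, newPath + f)

crawl('http://localhost/onlinecontent/Test', '/Test')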