Imagine you have the following list of URLs:
url_list = ['https://yourubl.nl/wp-content/uploads/elementor/forms/60916b7e4f600.pdf', 'https://yourubl.nl/wp-content/uploads/elementor/forms/60916d04e0d70.pdf', 'https://yourubl.nl/wp-content/uploads/elementor/forms/60917c5a5c95f.pdf']
# import the library
import requests
from requests.auth import HTTPBasicAuth

# make the requests
listfiles = []
for url in url_list:
    response = requests.get(url, auth=HTTPBasicAuth('c101838', 'HQSRynw9'))
    listfiles.append(response)

print(listfiles)
# [<Response [200]>, <Response [200]>, <Response [200]>]

Now I want to write each of the responses to a PDF file, named after the matching entry in this list:

kvk_list = ['88888888', '9999999', '4444444']

This is what I tried:

for files in listfiles:
    for kvk in kvk_list:
        with open(kvk + '.pdf', 'wb') as f:
            f.write(files.content)

However, all three output files end up containing only the last PDF...
How is this possible?
Please help!
Your nested loops open every kvk file once per response, always in 'wb' mode, so each pass overwrites what the previous one wrote and only the last response's content survives. Use the zip() function to tie kvk_list and listfiles together instead:
for kvk, files in zip(kvk_list, listfiles):
    with open(kvk + '.pdf', 'wb') as f:
        f.write(files.content)
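As a small optional addition (mine, not part of the original answer), you can also skip any response that did not come back with HTTP 200 before writing:

# Hedged sketch: only write a PDF when the paired response was successful.
for kvk, files in zip(kvk_list, listfiles):
    if files.status_code == 200:
        with open(kvk + '.pdf', 'wb') as f:
            f.write(files.content)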
I am trying to download multiple .csv files with similar names (AB_daily.csv, BC_daily.csv, etc.) from a URL directory. However, each file is stored in a different folder in the directory. I know there is a way to use a loop to extract the files, but I can't figure out how to do it with Beautiful Soup or glob. Do you have any suggestions? I've also used pandas.read_csv() to look for shortcuts, as I'm just trying to concatenate the files together later. Thank you.
URL Directory: https://dd.weather.gc.ca/hydrometric/csv/
import os
import requests
import urllib.request
from bs4 import BeautifulSoup

def main(url):
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        target = [f"{url[:20]}{item['href']}" for item in soup.select(
            "a[href$='AB_daily_hydrometric.csv']")]
        for x in target:
            print(f"Downloading {x}")
            r = req.get(x)
            name = x.rsplit("/", 1)[-1]
            with open(name, 'wb') as f:
                f.write(r.content)

main('https://dd.weather.gc.ca/hydrometric/csv')
# For this specific task this will work:
import os
import requests
import urllib.request
from bs4 import BeautifulSoup

csv_links = []
links = ["AB/", "BC/", "MB/", "NB/", "NL/", "NS/", "NT/", "NU/", "ON/", "PE/", "QC/", "SK/", "YT/"]

def main(url):
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        # collect every daily and hourly CSV link for each province folder
        for item in links:
            r_daily = BeautifulSoup(req.get(url + item + "daily/").content, 'html.parser')
            r_hourly = BeautifulSoup(req.get(url + item + "hourly/").content, 'html.parser')
            for item_d in r_daily.find_all('a'):
                if ".csv" in item_d.get('href'):
                    csv_links.append(url + item + "daily/" + item_d.get('href'))
            for item_h in r_hourly.find_all('a'):
                if ".csv" in item_h.get('href'):
                    csv_links.append(url + item + "hourly/" + item_h.get('href'))
        for x in csv_links:
            print(f"Downloading {x}")
            r = req.get(x)
            name = x.rsplit("/", 1)[-1]
            with open(name, 'wb') as f:
                f.write(r.content)

main('https://dd.weather.gc.ca/hydrometric/csv/')
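Since the question mentions concatenating the files later with pandas, here is a minimal sketch of that step, assuming the CSVs were downloaded into the working directory (the read_csv options may need tuning for these particular files):

import glob
import pandas as pd

# Hedged sketch: read every downloaded CSV and stack them into one DataFrame.
frames = [pd.read_csv(path) for path in glob.glob('*.csv')]
combined = pd.concat(frames, ignore_index=True)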
I have an Excel list of DOIs of papers I'm interested in. Based on this list, I would like to download all the papers.
I tried to do it with requests, as recommended in its documentation, but the PDF files I get are damaged: they are only a few KB in size. I changed the chunk_size several times, from None up to 1024*1024, and I have read many posts already. Nothing helps.
Please, what are your ideas?
import pandas as pd
import os
import requests

def get_pdf(doi, file_to_save_to):
    url = 'http://api.elsevier.com/content/article/doi:'+doi+'?view=FULL'
    headers = {
        'X-ELS-APIKEY': "keykeykeykeykeykey",
        'Accept': 'application/pdf'
    }
    r = requests.get(url, stream=True, headers=headers)
    if r.status_code == 200:
        for chunk in r.iter_content(chunk_size=1024*1024):
            file_to_save_to.write(chunk)
            return True

doi_list = pd.read_excel('list.xls')
doi_list.columns = ['DOIs']
count = 0
for doi in doi_list['DOIs']:
    doi = doi.replace('DOI:','')
    pdf = doi.replace('/','%')
    if not os.path.exists(f'path/{pdf}.pdf'):
        file = open(f'path/{pdf}.pdf', 'wb')
        get_pdf(doi, file)
        count += 1
        print(f"Downloaded: {count} of {len(doi_list['DOIs'])} articles")
I think your problem is the return True inside for chunk in r.iter_content: with that line, you'll only ever write one chunk of the PDF, of size chunk_size.
You should also open files using with; as written, the file handles are never closed.
import pandas as pd
import os
import requests

HEADERS = {
    'X-ELS-APIKEY': "keykeykeykeykeykey",
    'Accept': 'application/pdf'
}

def get_pdf(doi, file_to_save_to):
    url = f'http://api.elsevier.com/content/article/doi:{doi}?view=FULL'
    with requests.get(url, stream=True, headers=HEADERS) as r:
        if r.status_code == 200:
            for chunk in r.iter_content(chunk_size=1024*1024):
                file_to_save_to.write(chunk)

doi_list = pd.read_excel('list.xls')
doi_list.columns = ['DOIs']
count = 0
for doi in doi_list['DOIs']:
    doi = doi.replace('DOI:','')
    pdf = doi.replace('/','%')
    if not os.path.exists(f'path/{pdf}.pdf'):
        with open(f'path/{pdf}.pdf', 'wb') as file:
            get_pdf(doi, file)
        count += 1
        print(f"Downloaded: {count} of {len(doi_list['DOIs'])} articles")
I am trying to download, and save in a folder, all the PDFs contained in some websites with dynamic elements, e.g.: https://www.bankinter.com/banca/nav/documentos-datos-fundamentales
Every PDF in this URL has a similar href. Here are two of them:
"https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc=workspace://SpacesStore/fb029023-dd29-47d5-8927-31021d834757;1.0&nameDoc=ISIN_ES0213679FW7_41-Bonos_EstructuradosGarantizad_19.16_es.pdf"
"https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc=workspace://SpacesStore/852a7524-f21c-45e8-a8d9-1a75ce0f8286;1.1&nameDoc=20-Dep.Estruc.Cont.Financieros_18.1_es.pdf"
Here is what I did for another site; that code works as desired:
import os
import requests

link = 'https://www.bankia.es/estaticos/documentosPRIIPS/json/jsonSimple.txt'
base = 'https://www.bankia.es/estaticos/documentosPRIIPS/{}'
dirf = os.environ['USERPROFILE'] + r"\Documents\TFM\PdfFolder"
if not os.path.exists(dirf): os.makedirs(dirf)
os.chdir(dirf)

res = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
for item in res.json():
    if 'nombre_de_fichero' not in item: continue
    link = base.format(item['nombre_de_fichero'])
    filename_bankia = item['nombre_de_fichero'].split('.')[-2] + ".PDF"
    with open(filename_bankia, 'wb') as f:
        f.write(requests.get(link).content)
You have to make a POST HTTP request with the appropriate JSON payload. Once you get the response, parse the two fields objectId and nombreFichero and use them to build the right links to the PDFs. The following should work:
import os
import json
import requests

url = 'https://bancaonline.bankinter.com/publico/rs/documentacionPrix/list'
base = 'https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc={}&nameDoc={}'
payload = {"cod_categoria": 2, "cod_familia": 3, "divisaDestino": None, "vencimiento": None, "edadActuarial": None}

dirf = os.environ['USERPROFILE'] + r"\Desktop\PdfFolder"
if not os.path.exists(dirf): os.makedirs(dirf)
os.chdir(dirf)

r = requests.post(url, json=payload)
for item in r.json():
    objectId = item['objectId']
    nombreFichero = item['nombreFichero'].replace(" ", "_")
    filename = nombreFichero.split('.')[-2] + ".PDF"
    link = base.format(objectId, nombreFichero)
    with open(filename, 'wb') as f:
        f.write(requests.get(link).content)
After executing the above script, wait a little for it to finish, as the site is really slow.
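Because the site is so slow, one optional variation (an assumption on my part, not in the original answer) is to give each download an explicit timeout and pause briefly between requests; the 60-second timeout and 1-second pause are arbitrary values to tune:

import time

# Hedged sketch: fail fast on hung downloads and pause between requests.
for item in r.json():
    objectId = item['objectId']
    nombreFichero = item['nombreFichero'].replace(" ", "_")
    filename = nombreFichero.split('.')[-2] + ".PDF"
    link = base.format(objectId, nombreFichero)
    resp = requests.get(link, timeout=60)
    with open(filename, 'wb') as f:
        f.write(resp.content)
    time.sleep(1)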
I'm trying to download an image from a website, but I get a 404 error. I tried adding a user agent, with no success.
Here is the code:
import requests
import shutil

with open(r'C:\Users\home\Desktop\urls.csv') as file:
    csv = []
    for row in file:
        csv.append(row.split(";"))

row = 0
while row < len(csv):
    r = requests.get(csv[row][0], stream=True, headers={'User-agent': 'Mozilla/5.0'})
    if r.status_code == 200:
        with open(r"C:\Users\home\Desktop\images\house" + str(row) + ".jpg", 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
    row += 1
The URL is:
https://example.com/wp-content/uploads/2018/10/toronto-curbed-8.jpg
Replace example with cdn.icepop in the URL.
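For example, a quick way to apply that fix inside the download loop (a sketch; it assumes the CSV holds the example.com form of each link):

# Hedged sketch: rewrite the host before requesting the image.
fixed_url = csv[row][0].replace('https://example.com/', 'https://cdn.icepop.com/')
r = requests.get(fixed_url, stream=True, headers={'User-agent': 'Mozilla/5.0'})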
url="someurl"
outputfile='./file.zip'
link=urllib.urlopen(url)
soup= bs4.BeautifulSoup(link,'lxml')
links=[]
for data in soup.find_all('div', class_='master_content-outer-container'):
for a in data.find_all('a'):
links.append(a.get('href'))
output = open(outputfile, "wb")
for i in links:
request=urllib.urlopen(i)
read=request.read()
output.write(read)
output.close()
zip_ref= zipfile.ZipFile(outputfile,'r')
zip_ref.extractall('./data/')
zip_ref.close()
I have URLs stored in a list and am supplying them to urllib. Each URL ends with a .zip extension. When I run this code, I only get the last file downloaded from the list. There are more than 400 links to download.
Am I missing something?
You are writing all of the files into a single output file; that's not going to work.
Try this:
import os
import urllib
import bs4
import zipfile

url = "someurl"
outputfile = './file.zip'
link = urllib.urlopen(url)
soup = bs4.BeautifulSoup(link, 'lxml')
links = []
for data in soup.find_all('div', class_='master_content-outer-container'):
    for a in data.find_all('a'):
        links.append(a.get('href'))

for i in links:
    request = urllib.urlopen(i)
    read = request.read()
    file_name = os.path.basename(i)  # each zip gets its own file name
    output = open(file_name, "wb")
    output.write(read)
    output.close()

    zip_ref = zipfile.ZipFile(file_name, 'r')
    zip_ref.extractall('./data/')
    zip_ref.close()
Option 2
import os
import urllib
import bs4
import zipfile

url = "someurl"
link = urllib.urlopen(url)
soup = bs4.BeautifulSoup(link, 'lxml')

def download_and_extract(link):
    request = urllib.urlopen(link)
    read = request.read()
    file_name = os.path.basename(link)
    output = open(file_name, "wb")
    output.write(read)
    output.close()

    zip_ref = zipfile.ZipFile(file_name, 'r')
    zip_ref.extractall('./data/')
    zip_ref.close()

for data in soup.find_all('div', class_='master_content-outer-container'):
    for a in data.find_all('a'):
        download_and_extract(a.get('href'))
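Note that urllib.urlopen is the Python 2 spelling; on Python 3, a minimal sketch of the same per-file approach would use urllib.request and context managers:

import os
import zipfile
import urllib.request

# Hedged sketch: download one zip to its own file, then extract it.
def download_and_extract(link):
    file_name = os.path.basename(link)
    with urllib.request.urlopen(link) as response, open(file_name, 'wb') as output:
        output.write(response.read())
    with zipfile.ZipFile(file_name, 'r') as zip_ref:
        zip_ref.extractall('./data/')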