I am trying to download and save to a folder all the PDFs contained in some websites with dynamic elements, e.g.: https://www.bankinter.com/banca/nav/documentos-datos-fundamentales
Every PDF in this URL has a similar href. Here are two of them:
"https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc=workspace://SpacesStore/fb029023-dd29-47d5-8927-31021d834757;1.0&nameDoc=ISIN_ES0213679FW7_41-Bonos_EstructuradosGarantizad_19.16_es.pdf"
"https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc=workspace://SpacesStore/852a7524-f21c-45e8-a8d9-1a75ce0f8286;1.1&nameDoc=20-Dep.Estruc.Cont.Financieros_18.1_es.pdf"
Here is what I did for another website; this code works as desired:
import os
import requests

link = 'https://www.bankia.es/estaticos/documentosPRIIPS/json/jsonSimple.txt'
base = 'https://www.bankia.es/estaticos/documentosPRIIPS/{}'

dirf = os.environ['USERPROFILE'] + r"\Documents\TFM\PdfFolder"
if not os.path.exists(dirf):
    os.makedirs(dirf)
os.chdir(dirf)

res = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
for item in res.json():
    if 'nombre_de_fichero' not in item:
        continue
    link = base.format(item['nombre_de_fichero'])
    filename_bankia = item['nombre_de_fichero'].split('.')[-2] + ".PDF"
    with open(filename_bankia, 'wb') as f:
        f.write(requests.get(link).content)
You have to make a POST HTTP request with the appropriate JSON payload. Once you get the response, parse the two fields objectId and nombreFichero and use them to build the correct links to the PDFs. The following should work:
import os
import json
import requests

url = 'https://bancaonline.bankinter.com/publico/rs/documentacionPrix/list'
base = 'https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc={}&nameDoc={}'
payload = {"cod_categoria": 2, "cod_familia": 3, "divisaDestino": None, "vencimiento": None, "edadActuarial": None}

dirf = os.environ['USERPROFILE'] + r"\Desktop\PdfFolder"
if not os.path.exists(dirf):
    os.makedirs(dirf)
os.chdir(dirf)

r = requests.post(url, json=payload)
for item in r.json():
    objectId = item['objectId']
    nombreFichero = item['nombreFichero'].replace(" ", "_")
    filename = nombreFichero.split('.')[-2] + ".PDF"
    link = base.format(objectId, nombreFichero)
    with open(filename, 'wb') as f:
        f.write(requests.get(link).content)
After executing the above script, give it a little time to finish, as the site is really slow.
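If the request keeps timing out, one optional tweak is to give the POST an explicit timeout and retry it a couple of times before giving up; a rough sketch, reusing the url and payload defined above:

import time
import requests

# Retry the slow endpoint a few times, with a hard 60-second timeout per attempt.
for attempt in range(3):
    try:
        r = requests.post(url, json=payload, timeout=60)
        r.raise_for_status()
        break  # success: iterate over r.json() exactly as above
    except requests.RequestException as exc:
        print(f"Attempt {attempt + 1} failed: {exc}")
        time.sleep(5)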
I'm fairly new to using Python. I have been trying to set up a very basic web scraper to help speed up my workday; it is supposed to download images from a section of a website and save them.
I have a list of urls and I am trying to use urllib.request.urlretrieve to download all the images.
The output location (savepath) updates so it adds 1 to the current highest number in the folder.
I've tried a bunch of different ways but urlretrieve only saves the image from the last url in the list. Is there a way to download all the images in the url list?
to_download = ['url1', 'url2', 'url3', 'url4']
for t in to_download:
    urllib.request.urlretrieve(t, savepath)
This is the code I was trying to use to update the savepath every time
def getNextFilePath(photos):
    highest_num = 0
    for f in os.listdir(photos):
        if os.path.isfile(os.path.join(photos, f)):
            file_name = os.path.splitext(f)[0]
            try:
                file_num = int(file_name)
                if file_num > highest_num:
                    highest_num = file_num
            except ValueError:
                print('The file name "%s" is not an integer. Skipping' % file_name)
    output_file = os.path.join(output_folder, str(highest_num + 1))
    return output_file
As suggested by @vks, you need to update savepath (otherwise you save each URL onto the same file). One way to do so is to use enumerate:
from urllib import request

to_download = ['https://edition.cnn.com/', 'https://edition.cnn.com/', 'https://edition.cnn.com/', 'https://edition.cnn.com/']

for i, url in enumerate(to_download):
    save_path = f'website_{i}.txt'
    print(save_path)
    request.urlretrieve(url, save_path)
which you may want to contract into:
from urllib import request
to_download=['https://edition.cnn.com/','https://edition.cnn.com/','https://edition.cnn.com/','https://edition.cnn.com/']
[request.urlretrieve(url, f'website_{i}.txt') for i, url in enumerate(to_download)]
See:
the Python 3 documentation for enumerate
the documentation for f-strings (using a string with a {variable})
For the second part of the question:
Not sure what you are trying to achieve, but:
def getNextFilePath(photos):
    file_list = os.listdir(photos)
    file_list = [int(s) for s in file_list if s.isdigit()]
    print(file_list)
    max_id_file = max(file_list)
    print(f'max id:{max_id_file}')
    output_file = os.path.join(output_folder, str(max_id_file + 1))
    print(f'output file path:{output_file}')
    return output_file
This will hopefully find all files that are named with digits (IDs), find the highest ID, and return a new file name as max_id + 1. I guess this will replace the save_path in your example.
Quickly modifying the above function so that it returns the max_id rather than the path, the code below should be a working example using the iterator:
import os
from urllib import request

photo_folder = os.path.curdir

def getNextFilePath(photos):
    file_list = os.listdir(photos)
    print(file_list)
    file_list = [int(os.path.splitext(s)[0]) for s in file_list if os.path.splitext(s)[0].isdigit()]
    if not file_list:
        return 0
    print(file_list)
    max_id_file = max(file_list)
    # print(f'max id:{max_id_file}')
    # output_file = os.path.join(photo_folder, str(max_id_file + 1))
    # print(f'output file path:{output_file}')
    return max_id_file

def download_pic(to_download):
    start_id = getNextFilePath(photo_folder)
    for i, url in enumerate(to_download):
        save_path = f'{i+start_id}.png'
        output_file = os.path.join(photo_folder, save_path)
        print(output_file)
        request.urlretrieve(url, output_file)
You should add exception handling, etc., but this seems to be working, if I understood correctly.
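For example, a rough sketch of that exception handling (reusing getNextFilePath and photo_folder from above) could skip any URL that fails instead of stopping the whole run:

from urllib import request, error

def download_pic_safe(to_download):
    start_id = getNextFilePath(photo_folder)
    for i, url in enumerate(to_download):
        output_file = os.path.join(photo_folder, f'{i + start_id}.png')
        try:
            request.urlretrieve(url, output_file)
        except (error.URLError, ValueError) as exc:
            # A bad or unreachable URL shouldn't stop the remaining downloads.
            print(f'Could not download {url}: {exc}')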
Are you updating savepath? If you pass the same savepath to each loop iteration, it is likely just overwriting the same file over and over.
Hope that helps, happy coding!
Hey guys, so I am working on a Twitter bot that takes posts from Reddit and tweets them. My problem right now is that when I run it I get the "FileNotFoundError: [Errno 2] No such file or directory: 'posted.txt '" error.
But the thing is, as you will see in the image below and through my code, 'posted.txt' does exist and it is in the same directory. So I am kind of stuck on what the actual problem is. I have a nearly identical program to this where the 'Already_Tweeted' function works, but the only difference is that this one takes in image files as well (using BeautifulSoup). Could that be contributing to this error?
This is not the complete project, only what is hopefully relevant
import praw
import tweepy
import time
import os
from bs4 import BeautifulSoup as bs
import requests

posted_reddit_ids = 'posted.txt'

def tweet_creator(subreddit_info):
    '''Goes through posts on reddit and extracts a shortened link, title & ID'''
    post_links = []   # list to store our links
    post_titles = []  # list to store our titles
    post_ids = []     # list to store our id's
    post_imgs = []
    print("[bot] extracting posts from sub-reddit")
    for submission in subreddit_info.new(limit=5):
        if not already_tweeted(submission.id):
            post_titles.append(submission.title)
            post_links.append(submission.shortlink)
            post_ids.append(submission.id)
            post_imgs = get_image(submission.url)
            print(post_imgs)
        else:
            print("Already Tweeted")
    return post_links, post_titles, post_ids, post_imgs

def already_tweeted(id):
    '''reads through our .txt file and determines if tweet has already been posted'''
    found = 0
    with open(posted_reddit_ids, 'r') as f:
        for line in f:
            if id in line:
                found = 1
                break
    return found

def main():
    '''Main function'''
    # If the tweet tracking file does not already exist, create it
    if not os.path.exists(posted_reddit_ids):
        with open(posted_reddit_ids, 'w'):
            pass
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    subreddit = setup_connection_reddit(subreddit_to_watch)
    post_links, post_titles, post_ids, post_imgs = tweet_creator(subreddit)
    tweeter(post_links, post_titles, post_ids, post_imgs)

if __name__ == '__main__':
    main()
(Screenshot showing that the file and the program are in the same directory.)
Edit: It seems the error completely goes away when I remove the post_imgs = get_image(submission.url) line. Here is my code for the get_image function; maybe this can help solve my problem:
def get_image(img_url):
    url = img_url
    r = requests.get(url, headers={'User-Agent': 'reddit Twitter tool monitoring (by /u/RivianJourneyMan)'})
    data = r.text
    soup = bs(data, 'lxml')
    image_tags = soup.findAll('img')
    os.chdir(img_dir)
    x = 0
    mylist = []
    for image in image_tags:
        try:
            url = image['src']
            source = requests.get(url, stream=True)
            if source.status_code == 200:
                img_file = img_dir + str(x) + '.jpg'
                with open(img_file, 'wb') as f:
                    f.write(requests.get(url).content)
                    mylist.append(img_file)
                    f.close()
                    x += 1
                return img_file
        except:
            mylist.append(None)
    print(mylist)
    return mylist
What I am trying to accomplish here is to return a list of .jpg files from the get_image function and then copy that list over to post_imgs in the tweet_creator function.
I used pathlib instead of the os module, and it did not raise the FileNotFoundError:
#!/usr/bin/env python3.6
import time

import praw
import requests
import tweepy
from bs4 import BeautifulSoup as bs
from pathlib import Path

posted_reddit_ids = "posted.txt"

def tweet_creator(subreddit_info):
    """Goes through posts on reddit and extracts a shortened link, title & ID"""
    post_links = []   # list to store our links
    post_titles = []  # list to store our titles
    post_ids = []     # list to store our id's
    post_imgs = []
    print("[bot] extracting posts from sub-reddit")
    for submission in subreddit_info.new(limit=5):
        if not already_tweeted(submission.id):
            post_titles.append(submission.title)
            post_links.append(submission.shortlink)
            post_ids.append(submission.id)
            post_imgs = get_image(submission.url)
            print(post_imgs)
        else:
            print("Already Tweeted")
    return post_links, post_titles, post_ids, post_imgs

def already_tweeted(id):
    """reads through our .txt file and determines if tweet has already been posted"""
    return id in Path(posted_reddit_ids).read_text()

def main():
    """Main function"""
    # If the tweet tracking file does not already exist, create it
    Path(posted_reddit_ids).exists() or Path(posted_reddit_ids).write_text("")
    Path(img_dir).exists() or Path(img_dir).mkdir(parents=True)
    subreddit = setup_connection_reddit(subreddit_to_watch)
    post_links, post_titles, post_ids, post_imgs = tweet_creator(subreddit)
    tweeter(post_links, post_titles, post_ids, post_imgs)

if __name__ == "__main__":
    main()
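The likely culprit in the original get_image is the os.chdir(img_dir) call: it changes the working directory, so the relative 'posted.txt' path stops resolving on later loop iterations. A minimal sketch of a get_image that writes to explicit paths instead (assuming img_dir is defined at module level, as in your script) could look like this:

def get_image(img_url):
    r = requests.get(img_url, headers={"User-Agent": "reddit Twitter tool monitoring (by /u/RivianJourneyMan)"})
    soup = bs(r.text, "lxml")
    saved = []
    for x, image in enumerate(soup.find_all("img")):
        src = image.get("src")
        if not src:
            continue
        source = requests.get(src, stream=True)
        if source.status_code == 200:
            # Build an explicit path instead of calling os.chdir(img_dir).
            img_file = Path(img_dir) / f"{x}.jpg"
            img_file.write_bytes(source.content)
            saved.append(str(img_file))
    return saved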
I am trying to make a copy of Smartsheet data on my local disk. I am able to copy all the Smartsheet data except for the cell images. Below is the code I am using. It works perfectly fine to copy the data, but not the cell images.
NOTE: I am not trying to copy the attachments from Smartsheet; only the cell images and data.
Could someone help me to enhance this code to copy the cell images as well?
import json
import os
import requests
import time

token = "Bearer <TOken>"
backed_up_sheets = {"Attach": 86960044478894, "test2": 6659760455684}
dir = r'C:\Users\\me\SmartSheetsBackup\WorkSheet' + time.strftime("-%m_%d_%Y_%H_%M")
API_URL = "https://api.smartsheet.com/2.0/sheets/"
payload = {"Authorization": token,
           "Accept": "application/vnd.ms-excel,image/*"}

amount = len(backed_up_sheets)
i = 1
for el in backed_up_sheets:
    r = requests.get(API_URL + str(backed_up_sheets[el]), headers=payload)
    if r.status_code != 200:
        print('Some problem with connections please retry later0')
        pass
    if not os.path.exists(dir):
        os.makedirs(dir)
    with open(dir + el + time.strftime("-%m_%d_%Y_%H_%M") + ".xlsx", 'wb') as output:
        output.write(r.content)
    print('Progress in sheets: ' + str(i) + '/' + str(amount))
    i += 1
Here's a complete code sample:
# Download an image in a cell
def download_cell_image(client, sheet_id, row_id, column_id, default_filename):
    # Get desired row
    row = client.Sheets.get_row(sheet_id, row_id)

    cell = row.get_column(column_id)
    image = cell.image
    filename = getattr(image, 'alt_text', default_filename)

    # Obtain a temporary image URL
    imageUrl = client.models.ImageUrl({"imageId": image.id})
    response = client.Images.get_image_urls([imageUrl])
    url = response.image_urls[0].url

    # Download the image
    import requests
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
Note that this requires SDK version 1.3.0 or later
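For reference, a hypothetical call site could look like the sketch below; the row and column ids are placeholders, and the access token is assumed to be in an environment variable:

import os
import smartsheet  # smartsheet-python-sdk

client = smartsheet.Smartsheet(os.environ['SMARTSHEET_ACCESS_TOKEN'])
# Placeholder ids: substitute the row and column of the cell that holds the image.
download_cell_image(client, 86960044478894, 1234567890123456, 6543210987654321, 'cell_image.jpg')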
The same steps illustrated in the cURL example should work in Python. (Apologies that we don't have a complete published sample.)
Get the image id from the cell object, as returned from get_sheet
Convert the image id to a download url, using images.get_image_urls (docs)
Download the image from the url, probably using the Requests library.
I am completely new to Python and am studying web crawling.
I am trying to download each individual target link as a text page.
So far, I have succeeded in extracting all the target URLs I need, but I have no idea how to download all the target HTML texts into multiple files. The code below just writes the same article into multiple files.
Can someone help me, please?
url = ""
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "lxml")

link1 = soup2.find_all('a', href=re.compile("drupal_lists"))
for t1 in link1:
    print(t1.attrs['href'])

link_data = requests.get(t.attrs['href']).text

import io

for i in link_data:
    link_data
    with io.open("file_" + str(i) + ".txt", 'w', encoding='utf-8') as f:
        f.write(str(i) + link_data)
In the style of your code, starting from the point when things change:
for i, t1 in enumerate(link1):  # Get indices and data in one go
    link_data = requests.get(t1.attrs['href']).text
    with io.open("file_" + str(i) + ".txt", 'w', encoding='utf-8') as f:
        f.write(link_data)  # no str(i) because that would mess with the HTML
I'm trying to scrape a forum discussion and export it as a csv file, with rows such as "thread title", "user", and "post", where the latter is the actual forum post from each individual.
I'm a complete beginner with Python and BeautifulSoup so I'm having a really hard time with this!
My current problem is that all the text is split into one character per row in the csv file. Is there anyone out there who can help me out? It would be fantastic if someone could give me a hand!
Here's the code I've been using:
from bs4 import BeautifulSoup
import csv
import urllib2
f = urllib2.urlopen("https://silkroad5v7dywlc.onion.to/index.php?action=printpage;topic=28536.0")
soup = BeautifulSoup(f)
b = soup.get_text().encode("utf-8").strip() #the posts contain non-ascii words, so I had to do this
writer = csv.writer(open('silkroad.csv', 'w'))
writer.writerows(b)
Ok here we go. Not quite sure what I'm helping you do here, but hopefully you have a good reason to be analyzing silk road posts.
You have a few issues here; the big one is that you aren't parsing the data at all. What you're essentially doing with .get_text() is going to the page, highlighting the whole thing, and then copying and pasting the whole thing into a csv file.
So here is what you should be trying to do:
Read the page source
Use soup to break it into sections you want
Save sections in parallel arrays for author, date, time, post, etc
Write data to csv file row by row
I wrote some code to show you what that looks like; it should do the job:
from bs4 import BeautifulSoup
import csv
import urllib2
# get page source and create a BeautifulSoup object based on it
print "Reading page..."
page = urllib2.urlopen("https://silkroad5v7dywlc.onion.to/index.php?action=printpage;topic=28536.0")
soup = BeautifulSoup(page)
# if you look at the HTML all the titles, dates,
# and authors are stored inside of <dt ...> tags
metaData = soup.find_all("dt")
# likewise the post data is stored
# under <dd ...>
postData = soup.find_all("dd")
# define where we will store info
titles = []
authors = []
times = []
posts = []
# now we iterate through the metaData and parse it
# into titles, authors, and dates
print "Parsing data..."
for html in metaData:
    text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "")  # convert the html to text
    titles.append(text.split("Title:")[1].split("Post by:")[0].strip())  # get Title:
    authors.append(text.split("Post by:")[1].split(" on ")[0].strip())  # get Post by:
    times.append(text.split(" on ")[1].strip())  # get date
# now we go through the actual post data and extract it
for post in postData:
    posts.append(BeautifulSoup(str(post)).get_text().encode("utf-8").strip())
# now we write data to csv file
# ***csv files MUST be opened with the 'b' flag***
csvfile = open('silkroad.csv', 'wb')
writer = csv.writer(csvfile)
# create template
writer.writerow(["Time", "Author", "Title", "Post"])
# iterate through and write all the data
for time, author, title, post in zip(times, authors, titles, posts):
    writer.writerow([time, author, title, post])
# close file
csvfile.close()
# done
print "Operation completed successfully."
EDIT: Included a solution that can read files from a directory and use the data from them.
Okay, so you have your HTML files in a directory. You need to get a list of files in the directory, iterate through them, and append to your csv file for each file in the directory.
This is the basic logic of our new program.
If we had a function called processData() that took a file path as an argument and appended data from the file to your csv file here is what it would look like:
# the directory where we have all our HTML files
dir = "myDir"
# our csv file
csvFile = "silkroad.csv"
# insert the column titles to csv
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Time", "Author", "Title", "Post"])
csvfile.close()
# get a list of files in the directory
fileList = os.listdir(dir)
# define variables we need for status text
totalLen = len(fileList)
count = 1
# iterate through files and read all of them into the csv file
for htmlFile in fileList:
    path = os.path.join(dir, htmlFile)  # get the file path
    processData(path)  # process the data in the file
    print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..."  # display status
    count = count + 1  # increment counter
As it happens our processData() function is more or less what we did before, with a few changes.
So this is very similar to our last program, with a few small changes:
We write the column headers first thing
Following that we open the csv with the 'ab' flag to append
We import os to get a list of files
Here's what that looks like:
from bs4 import BeautifulSoup
import csv
import urllib2
import os # added this import to process files/dirs
# ** define our data processing function
def processData( pageFile ):
    ''' take the data from an html file and append to our csv file '''
    f = open(pageFile, "r")
    page = f.read()
    f.close()
    soup = BeautifulSoup(page)

    # if you look at the HTML all the titles, dates,
    # and authors are stored inside of <dt ...> tags
    metaData = soup.find_all("dt")

    # likewise the post data is stored
    # under <dd ...>
    postData = soup.find_all("dd")

    # define where we will store info
    titles = []
    authors = []
    times = []
    posts = []

    # now we iterate through the metaData and parse it
    # into titles, authors, and dates
    for html in metaData:
        text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "")  # convert the html to text
        titles.append(text.split("Title:")[1].split("Post by:")[0].strip())  # get Title:
        authors.append(text.split("Post by:")[1].split(" on ")[0].strip())  # get Post by:
        times.append(text.split(" on ")[1].strip())  # get date

    # now we go through the actual post data and extract it
    for post in postData:
        posts.append(BeautifulSoup(str(post)).get_text().encode("utf-8").strip())

    # now we write data to csv file
    # ***csv files MUST be opened with the 'b' flag***
    csvfile = open('silkroad.csv', 'ab')
    writer = csv.writer(csvfile)

    # iterate through and write all the data
    for time, author, title, post in zip(times, authors, titles, posts):
        writer.writerow([time, author, title, post])

    # close file
    csvfile.close()
# ** start our process of going through files
# the directory where we have all our HTML files
dir = "myDir"
# our csv file
csvFile = "silkroad.csv"
# insert the column titles to csv
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Time", "Author", "Title", "Post"])
csvfile.close()
# get a list of files in the directory
fileList = os.listdir(dir)
# define variables we need for status text
totalLen = len(fileList)
count = 1
# iterate through files and read all of them into the csv file
for htmlFile in fileList:
    path = os.path.join(dir, htmlFile)  # get the file path
    processData(path)  # process the data in the file
    print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..."  # display status
    count = count + 1  # increment counter