I basically used requests and now I have links to webpages which have images on them. I want to extract just the images and make a file where I store them for later viewing. How would I do so?
import requests
import string
import random
def id_generator(size=6, chars=string.ascii_lowercase + string.digits):
    """Return a random identifier of ``size`` characters drawn from ``chars``.

    Args:
        size: Number of characters to generate; ``0`` yields ``""``.
        chars: Non-empty sequence of candidate characters (lowercase ASCII
            letters and digits by default).

    Returns:
        The generated string.
    """
    return ''.join(random.choice(chars) for _ in range(size))
# Request 50 randomly generated prnt.sc pages.
# NOTE(review): the page responses are fetched but never parsed, so no image
# is extracted from them yet.
for i in range(50):
    link = id_generator()
    page_url = "https://prnt.sc/" + link
    print(page_url)
    r = requests.get(page_url)

# Download one fixed sample image and store it on disk.
response = requests.get("https://i.imgur.com/ExdKOOz.png")
# The context manager closes the file even if the write fails.
with open("sample_image.png", "wb") as file:
    file.write(response.content)
Related
I am trying to download all the PDFs from some websites with dynamic elements and save them in a folder, e.g. https://www.bankinter.com/banca/nav/documentos-datos-fundamentales
Every PDF on this page has a similar href. Here are two of them:
"https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc=workspace://SpacesStore/fb029023-dd29-47d5-8927-31021d834757;1.0&nameDoc=ISIN_ES0213679FW7_41-Bonos_EstructuradosGarantizad_19.16_es.pdf"
"https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc=workspace://SpacesStore/852a7524-f21c-45e8-a8d9-1a75ce0f8286;1.1&nameDoc=20-Dep.Estruc.Cont.Financieros_18.1_es.pdf"
Here is what I did for another website; this code works as desired:
# Download every PDF listed in the Bankia JSON index into a local folder.
link = 'https://www.bankia.es/estaticos/documentosPRIIPS/json/jsonSimple.txt'
base = 'https://www.bankia.es/estaticos/documentosPRIIPS/{}'
# Build the folder with os.path.join: the former literal relied on invalid
# escape sequences ("\D", "\T", "\P"), and it was bound to `dirf` while the
# following lines used the undefined name `dirf2`.
dirf = os.path.join(os.environ['USERPROFILE'], "Documents", "TFM", "PdfFolder")
os.makedirs(dirf, exist_ok=True)
# Files below are written with relative names, so switch into the folder.
os.chdir(dirf)
res = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
for item in res.json():
    # Entries without a file name cannot be downloaded.
    if 'nombre_de_fichero' not in item:
        continue
    link = base.format(item['nombre_de_fichero'])
    filename_bankia = item['nombre_de_fichero'].split('.')[-2] + ".PDF"
    with open(filename_bankia, 'wb') as f:
        f.write(requests.get(link).content)
You have to make an HTTP POST request with the appropriate JSON payload. Once you get the response, parse the two fields objectId and nombreFichero and use them to build the correct links to the PDFs. The following should work:
import os
import json
import requests
# Download every PDF returned by the Bankinter document-list endpoint.
url = 'https://bancaonline.bankinter.com/publico/rs/documentacionPrix/list'
base = 'https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc={}&nameDoc={}'
payload = {"cod_categoria": 2, "cod_familia": 3, "divisaDestino": None, "vencimiento": None, "edadActuarial": None}
# os.path.join avoids the invalid escape sequences ("\D", "\P") that the
# hand-written backslash literal contained.
dirf = os.path.join(os.environ['USERPROFILE'], "Desktop", "PdfFolder")
os.makedirs(dirf, exist_ok=True)
# Files below are written with relative names, so switch into the folder.
os.chdir(dirf)
r = requests.post(url, json=payload)
for item in r.json():
    objectId = item['objectId']
    nombreFichero = item['nombreFichero'].replace(" ", "_")
    # Strip only the final extension. Document names contain dots themselves
    # (e.g. "..._19.16_es.pdf"), so split('.')[-2] kept just "16_es" and
    # different documents overwrote each other.
    filename = os.path.splitext(nombreFichero)[0] + ".PDF"
    link = base.format(objectId, nombreFichero)
    with open(filename, 'wb') as f:
        f.write(requests.get(link).content)
After executing the above script, wait a little for it to finish, as the site is really slow.
I would like to use a python script to find and replace some text in an InDesign file and then save it as pdf.
I managed to use python to open indesign and save it as pdf however I do not know how to search for text and replace it with a random string generated by the first part of the script.
Here is what I got so far:
import win32com.client
import random
import string
def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a random identifier of ``size`` characters drawn from ``chars``.

    Args:
        size: Number of characters to generate; ``0`` yields ``""``.
        chars: Non-empty sequence of candidate characters (uppercase ASCII
            letters and digits by default).

    Returns:
        The generated string.
    """
    return ''.join(random.choice(chars) for _ in range(size))
# `os` is needed for the path handling below and was not imported by this script.
import os

# Random code intended to replace a placeholder text in the document.
voucher = id_generator()
app = win32com.client.Dispatch('InDesign.Application.CC.2018')
myFile = r'C:\Users\some_file.indd'
myDocument = app.Open(myFile)
myPDFFile = r'C:\Users\some_file.pdf'
directory = os.path.dirname(myPDFFile)
idPDFType = 1952403524
# 1=[High Quality Print], 2=[PDF/X-1a:2001] etc..
myPDFPreset = app.PDFExportPresets.Item(1)
try:
    # exist_ok makes a separate existence check unnecessary.
    os.makedirs(directory, exist_ok=True)
    myDocument.Export(idPDFType, myPDFFile, False, myPDFPreset)
except Exception as e:
    print('Export to PDF failed: ' + str(e))
finally:
    # Always release the document, whatever happened during the export.
    myDocument.Close()
You need to iterate over all of the TextFrames of the document and then search and replace the text with the ChangeText function.
Here is a snippet of what you can do:
# Replace the search text with a freshly generated voucher in every text
# frame of the document that contains it.
voucher = id_generator()
searchText = 'test'
app = win32com.client.Dispatch('InDesign.Application.CC.2018')
app.scriptPreferences.userInteractionLevel = 1699640946
myFile = r'C:\Users\some_file.indd'
myDocument = app.Open(myFile)
myPage = myDocument.Pages.Item(1)
idNothing = 1851876449 #from enum idNothingEnum, see doc_reference
for frame in myDocument.TextFrames:
    # Frames without the search text are left untouched.
    if searchText not in frame.Contents:
        continue
    app.FindTextPreferences.FindWhat = searchText
    app.ChangeTextPreferences.ChangeTo = voucher
    frame.ChangeText()
# Reset the find/change preferences once all frames have been processed.
app.FindTextPreferences.FindWhat = idNothing
app.ChangeTextPreferences.ChangeTo = idNothing
#and then save the changes as PDF...
Hey guys so I am working on a twitter bot that takes posts from reddit and tweets them. My problem right now is when I run it I get the " FileNotFoundError: [Errno 2] No such file or directory: 'posted.txt '" error.
But the thing is as you will see in the image below and through my
code, 'posted.txt' does exist and it is in the same directory. So I am kind of stuck on what the
actual problem is. I have a nearly identical program to this where the
'Already_Tweeted' function works but the only difference is that this
one takes in image files as well (using BeautifulSoup). Could that be contributing to this
error?
This is not the complete project, only what is hopefully relevant
import praw
import tweepy
import time
import os
from bs4 import BeautifulSoup as bs
import requests
# Resolve the tracking file to an absolute path once, at import time, so that
# a later os.chdir() elsewhere in the program cannot make the relative name
# 'posted.txt' point into a different directory (FileNotFoundError).
posted_reddit_ids = os.path.abspath('posted.txt')
def tweet_creator(subreddit_info):
    '''Collect link, title, ID and images of every not-yet-tweeted post.

    Args:
        subreddit_info: object whose ``new(limit=...)`` yields submissions
            exposing ``id``, ``title``, ``shortlink`` and ``url``.

    Returns:
        Tuple ``(post_links, post_titles, post_ids, post_imgs)`` of four
        lists with shared indexing: entry ``i`` of each list belongs to the
        same submission.
    '''
    post_links = []   # shortened links
    post_titles = []  # titles
    post_ids = []     # reddit IDs
    post_imgs = []    # one get_image() result per collected post
    print("[bot] extracting posts from sub-reddit")
    for submission in subreddit_info.new(limit=5):
        if already_tweeted(submission.id):
            print("Already Tweeted")
            continue
        post_titles.append(submission.title)
        post_links.append(submission.shortlink)
        post_ids.append(submission.id)
        # Append instead of rebinding post_imgs: rebinding kept only the last
        # post's images and broke the alignment with the other three lists.
        # NOTE(review): the consumer must accept one get_image() result per
        # post -- confirm against tweeter().
        images = get_image(submission.url)
        post_imgs.append(images)
        print(images)
    return post_links, post_titles, post_ids, post_imgs
def already_tweeted(id):
    '''Return 1 if ``id`` occurs in any line of the tracking file, else 0.'''
    with open(posted_reddit_ids, 'r') as tracking_file:
        # any() stops at the first matching line, like the explicit break did.
        matched = any(id in line for line in tracking_file)
    return 1 if matched else 0
def main():
    '''Prepare the tracking file and image directory, then fetch and tweet posts.'''
    # Append mode creates the file when it is missing and never truncates an
    # existing one, so no separate existence check (and its race) is needed.
    with open(posted_reddit_ids, 'a'):
        pass
    os.makedirs(img_dir, exist_ok=True)
    subreddit = setup_connection_reddit(subreddit_to_watch)
    post_links, post_titles, post_ids, post_imgs = tweet_creator(subreddit)
    tweeter(post_links, post_titles, post_ids, post_imgs)


if __name__ == '__main__':
    main()
To show the file and program are in the same directory
Edit:
It seems the error completely goes away when I remove the post_imgs = get_image(submission.url)
Here is my code for the get_image function, maybe this can help solve my problem
def get_image(img_url):
    """Download every ``<img>`` referenced by a page and return the saved paths.

    The working directory is left untouched: files are addressed through
    ``img_dir`` explicitly, so relative paths used elsewhere in the program
    (such as the tweet tracking file) keep resolving to the same location.

    Args:
        img_url: URL of the page to scan for ``<img>`` tags.

    Returns:
        List of paths (``<img_dir>/<n>.jpg``) of the images that were saved,
        in page order. Images that cannot be fetched or written are skipped,
        so the list may be empty.
    """
    from urllib.parse import urljoin

    headers = {'User-Agent': 'reddit Twitter tool monitoring (by /u/RivianJourneyMan)'}
    page = requests.get(img_url, headers=headers)
    soup = bs(page.text, 'lxml')

    saved = []
    for image in soup.findAll('img'):
        src = image.get('src')
        if not src:
            continue
        try:
            # urljoin resolves relative and protocol-relative src values
            # against the page URL; absolute URLs pass through unchanged.
            source = requests.get(urljoin(img_url, src))
            if source.status_code != 200:
                continue
            img_file = os.path.join(img_dir, str(len(saved)) + '.jpg')
            # Reuse the response body instead of downloading a second time.
            with open(img_file, 'wb') as f:
                f.write(source.content)
        except (requests.RequestException, OSError) as exc:
            # Best effort: one broken image must not abort the whole page.
            print("[bot] could not save image {}: {}".format(src, exc))
            continue
        saved.append(img_file)
    print(saved)
    return saved
What I am trying to accomplish here, is return a list of .jpg files in the get_image function and then copy that list over to the post_imgs in the tweet_creator function.
I use pathlib instead of module os, and it did not raise the FileNotFoundError:
#!/usr/bin/env python3.6
import time
import praw
import requests
import tweepy
from bs4 import BeautifulSoup as bs
from pathlib import Path
# Resolve the tracking file to an absolute path once, at import time, so that
# a later change of the working directory cannot make the relative name
# "posted.txt" point somewhere else. Kept as a str for existing callers.
posted_reddit_ids = str(Path("posted.txt").resolve())
def tweet_creator(subreddit_info):
    """Collect link, title, ID and images of every not-yet-tweeted post.

    Args:
        subreddit_info: object whose ``new(limit=...)`` yields submissions
            exposing ``id``, ``title``, ``shortlink`` and ``url``.

    Returns:
        Tuple ``(post_links, post_titles, post_ids, post_imgs)`` of four
        lists with shared indexing: entry ``i`` of each list belongs to the
        same submission.
    """
    post_links = []   # shortened links
    post_titles = []  # titles
    post_ids = []     # reddit IDs
    post_imgs = []    # one get_image() result per collected post
    print("[bot] extracting posts from sub-reddit")
    for submission in subreddit_info.new(limit=5):
        if already_tweeted(submission.id):
            print("Already Tweeted")
            continue
        post_titles.append(submission.title)
        post_links.append(submission.shortlink)
        post_ids.append(submission.id)
        # Append instead of rebinding post_imgs: rebinding kept only the last
        # post's images and broke the alignment with the other three lists.
        # NOTE(review): the consumer must accept one get_image() result per
        # post -- confirm against tweeter().
        images = get_image(submission.url)
        post_imgs.append(images)
        print(images)
    return post_links, post_titles, post_ids, post_imgs
def already_tweeted(id):
    """Report whether ``id`` occurs anywhere in the tracking file's text."""
    tracked_text = Path(posted_reddit_ids).read_text()
    return id in tracked_text
def main():
    """Prepare the tracking file and image directory, then fetch and tweet posts."""
    # touch()/mkdir() with exist_ok=True create what is missing and leave
    # existing content alone, replacing the `exists() or create()` expressions.
    Path(posted_reddit_ids).touch(exist_ok=True)
    Path(img_dir).mkdir(parents=True, exist_ok=True)
    subreddit = setup_connection_reddit(subreddit_to_watch)
    post_links, post_titles, post_ids, post_imgs = tweet_creator(subreddit)
    tweeter(post_links, post_titles, post_ids, post_imgs)


if __name__ == "__main__":
    main()
How can I improve download speed with urllib.request? I want to download images from the web, and it works well, but it takes too long: download_album_arts() took 42 seconds to execute. What can I do about that? Can I use multiprocessing or something similar?
import os
import shutil
import requests
from bs4 import BeautifulSoup
from urllib import request
# Chart page to scrape.
URL = 'https://music.bugs.co.kr/chart/track/day/total'
# Directory prefix (with trailing slash) below the current working directory.
PATH = ''.join((os.getcwd(), '/static/images/'))
# Fetch the HTML source of a page
def get_html(target_url):
    """Return the body text of ``target_url``, or ``""`` unless the status is 200."""
    response = requests.get(target_url)
    return response.text if response.status_code == 200 else ""
# Collect the thumbnail image URLs of the chart page
def get_image_url():
    """Return the ``src`` of every ``a.thumbnail > img`` element found at ``URL``."""
    soup = BeautifulSoup(get_html(URL), 'html.parser')
    return [
        image.get('src')
        for image in soup.select('a.thumbnail > img')
        if image.has_attr('src')
    ]
# download album art in static/images directory
def download_album_arts(max_workers=16):
    """Download up to 100 chart thumbnails concurrently into ``PATH``.

    The downloads are network-bound, so running them in a thread pool
    overlaps the waiting time instead of fetching one file after another.

    Args:
        max_workers: Maximum number of simultaneous downloads (must be > 0).

    Raises:
        Whatever ``urlretrieve`` raises for a failed download.
    """
    from concurrent.futures import ThreadPoolExecutor

    # Cap at 100 as before, but tolerate pages listing fewer images (the
    # fixed range(0, 100) raised IndexError in that case).
    images = get_image_url()[:100]
    targets = [PATH + str(rank) + '.png' for rank in range(1, len(images) + 1)]
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # list() drains the result iterator so a failed download re-raises here.
        list(pool.map(request.urlretrieve, images, targets))
# delete all album art
def delete_album_art():
    """Leave an empty ``static/images`` directory under the working directory.

    An existing directory is removed with everything in it and then
    recreated. Missing parent directories (``static``) are created as well,
    so the function also works on a fresh checkout.
    """
    path = os.getcwd() + '/static/images'
    if os.path.exists(path):
        shutil.rmtree(path)
    # makedirs instead of mkdir: mkdir failed when 'static' did not exist yet.
    os.makedirs(path)
If you were to save an image using its URL, how would you do it?
Also, how do I give the image a unique file name while saving it?
# urlopen lives in urllib.request on Python 3 (urllib.urlopen was Python 2).
import urllib.request

# Random 10-character name made of uppercase letters and digits.
file_name = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))
# Both context managers release the HTTP response and the file handle even
# when reading or writing fails.
with urllib.request.urlopen(image_url) as response, \
        open('/media/images/temp/' + file_name, "wb") as f:
    f.write(response.read())
It throws no error nor saves the file... I'm new to this I have no clue what is going wrong : |
import urllib
import string
import random
import os
# urlretrieve lives in urllib.request on Python 3 (urllib.urlretrieve was
# Python 2), and a bare `import urllib` does not load that submodule.
import urllib.request

filename_charset = string.ascii_letters + string.digits
filename_length = 10
file_save_dir = '/home/user/download/'
# Random base name; the extension is appended when the target path is built.
filename = ''.join(random.choice(filename_charset)
                   for _ in range(filename_length))
urllib.request.urlretrieve("http://www.example.com/image.png",
                           os.path.join(file_save_dir, filename + '.png'))