I am trying to create a recursive file structure from a page's links, but when I run it, the script creates the first level of folders and never goes any deeper.
import urllib2
from bs4 import BeautifulSoup
import shutil
import os

ext = [".html", ".jpeg", ".png", ".gif", ".jpg"]

def findLinks(url, newPath):
    resp = urllib2.urlopen(url)  # open first link
    if resp.getcode() == 200:
        if "text/html" in resp.headers["content-type"]:
            s = BeautifulSoup(resp.read(), "html.parser")
            links = s.find_all('a')  # put all <a> links into the links list
            for link in links:
                f = link['href']
                print f
                newDir = newPath + f
                if not os.path.isdir(newDir):  # if it doesn't already exist
                    if not newDir.endswith(tuple(ext)):  # if not a file
                        os.makedirs(newDir)  # create all directories
                if newDir.endswith(".html"):
                    newFile = open(newDir, 'w+')
                    newFile.write("sample text")
                    newFile.close()
            return links

findLinks('http://localhost/onlinecontent/Test', '/Test')
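As written, findLinks never calls itself, so only the links on the first page are processed and nothing ever descends into the folders it just created. A minimal sketch of the missing recursive step, assuming each href f names a sub-page relative to the current url (that assumption is mine; adjust it to how your links are actually built):

    # minimal sketch, not from the original post: recurse into each new directory
    if not newDir.endswith(tuple(ext)):  # same "not a file" check as above
        os.makedirs(newDir)
        findLinks(url + "/" + f, newDir)  # assumes f is relative to the current url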
I am trying to download and save to a folder all the PDFs contained in some web pages with dynamic elements, e.g.: https://www.bankinter.com/banca/nav/documentos-datos-fundamentales
Every PDF at this URL has a similar href. Here are two of them:
"https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc=workspace://SpacesStore/fb029023-dd29-47d5-8927-31021d834757;1.0&nameDoc=ISIN_ES0213679FW7_41-Bonos_EstructuradosGarantizad_19.16_es.pdf"
"https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc=workspace://SpacesStore/852a7524-f21c-45e8-a8d9-1a75ce0f8286;1.1&nameDoc=20-Dep.Estruc.Cont.Financieros_18.1_es.pdf"
Here is what I did for another site; this code works as desired:
import os
import requests

link = 'https://www.bankia.es/estaticos/documentosPRIIPS/json/jsonSimple.txt'
base = 'https://www.bankia.es/estaticos/documentosPRIIPS/{}'
dirf = os.environ['USERPROFILE'] + r"\Documents\TFM\PdfFolder"
if not os.path.exists(dirf): os.makedirs(dirf)
os.chdir(dirf)

res = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
for item in res.json():
    if 'nombre_de_fichero' not in item: continue
    link = base.format(item['nombre_de_fichero'])
    filename_bankia = item['nombre_de_fichero'].split('.')[-2] + ".PDF"
    with open(filename_bankia, 'wb') as f:
        f.write(requests.get(link).content)
You have to make a POST HTTP request with the appropriate JSON payload. Once you get the response, parse the two fields objectId and nombreFichero and use them to build the right links to the PDFs. The following should work:
import os
import json
import requests

url = 'https://bancaonline.bankinter.com/publico/rs/documentacionPrix/list'
base = 'https://bancaonline.bankinter.com/publico/DocumentacionPrixGet?doc={}&nameDoc={}'
payload = {"cod_categoria": 2, "cod_familia": 3, "divisaDestino": None, "vencimiento": None, "edadActuarial": None}

dirf = os.environ['USERPROFILE'] + r"\Desktop\PdfFolder"
if not os.path.exists(dirf): os.makedirs(dirf)
os.chdir(dirf)

r = requests.post(url, json=payload)
for item in r.json():
    objectId = item['objectId']
    nombreFichero = item['nombreFichero'].replace(" ", "_")
    filename = nombreFichero.split('.')[-2] + ".PDF"
    link = base.format(objectId, nombreFichero)
    with open(filename, 'wb') as f:
        f.write(requests.get(link).content)
After executing the above script, give it a little time to finish, as the site is really slow.
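If any of the documents are large, a streamed download keeps memory use flat; a minimal variation of the last two lines of the loop above, using the same link and filename variables:

    # optional variation: stream each PDF to disk in chunks instead of loading it whole
    with requests.get(link, stream=True) as resp:
        with open(filename, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)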
Hey guys, I am working on a Twitter bot that takes posts from Reddit and tweets them. My problem right now is that when I run it I get a "FileNotFoundError: [Errno 2] No such file or directory: 'posted.txt'" error.
But the thing is, as you can see in the image below and in my code, 'posted.txt' does exist and it is in the same directory. So I am kind of stuck on what the actual problem is. I have a nearly identical program where the already_tweeted function works; the only difference is that this one takes in image files as well (using BeautifulSoup). Could that be contributing to this error?
This is not the complete project, only what is hopefully relevant
import praw
import tweepy
import time
import os
from bs4 import BeautifulSoup as bs
import requests

posted_reddit_ids = 'posted.txt'

def tweet_creator(subreddit_info):
    '''Goes through posts on reddit and extracts a shortened link, title & ID'''
    post_links = []   # list to store our links
    post_titles = []  # list to store our titles
    post_ids = []     # list to store our id's
    post_imgs = []
    print("[bot] extracting posts from sub-reddit")
    for submission in subreddit_info.new(limit=5):
        if not already_tweeted(submission.id):
            post_titles.append(submission.title)
            post_links.append(submission.shortlink)
            post_ids.append(submission.id)
            post_imgs = get_image(submission.url)
            print(post_imgs)
        else:
            print("Already Tweeted")
    return post_links, post_titles, post_ids, post_imgs

def already_tweeted(id):
    '''reads through our .txt file and determines if tweet has already been posted'''
    found = 0
    with open(posted_reddit_ids, 'r') as f:
        for line in f:
            if id in line:
                found = 1
                break
    return found

def main():
    '''Main function'''
    # If the tweet tracking file does not already exist, create it
    if not os.path.exists(posted_reddit_ids):
        with open(posted_reddit_ids, 'w'):
            pass
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    subreddit = setup_connection_reddit(subreddit_to_watch)
    post_links, post_titles, post_ids, post_imgs = tweet_creator(subreddit)
    tweeter(post_links, post_titles, post_ids, post_imgs)

if __name__ == '__main__':
    main()
To show the file and program are in the same directory
Edit:
It seems the error completely goes away when I remove the line post_imgs = get_image(submission.url).
Here is my code for the get_image function; maybe this can help solve my problem:
def get_image(img_url):
    url = img_url
    r = requests.get(url, headers={'User-Agent': 'reddit Twitter tool monitoring (by /u/RivianJourneyMan)'})
    data = r.text
    soup = bs(data, 'lxml')
    image_tags = soup.findAll('img')
    os.chdir(img_dir)
    x = 0
    mylist = []
    for image in image_tags:
        try:
            url = image['src']
            source = requests.get(url, stream=True)
            if source.status_code == 200:
                img_file = img_dir + str(x) + '.jpg'
                with open(img_file, 'wb') as f:
                    f.write(requests.get(url).content)
                    mylist.append(img_file)
                    f.close()
                x += 1
                return img_file
        except:
            mylist.append(None)
    print(mylist)
    return mylist
What I am trying to accomplish here is to return a list of .jpg files from the get_image function and then copy that list over to post_imgs in the tweet_creator function.
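As an aside (my own observation, not from the original posts): the return img_file inside the if block exits get_image after the very first saved image, so the function never returns the whole list. A rough sketch of a variant that returns the list only after the loop finishes, assuming the same img_dir variable and the requests/bs imports from the snippet above:

    # rough sketch, my own rewrite: collect every saved file, return the list once
    def get_image(img_url):
        soup = bs(requests.get(img_url).text, 'lxml')
        mylist = []
        for x, image in enumerate(soup.findAll('img', src=True)):
            source = requests.get(image['src'], stream=True)
            if source.status_code == 200:
                img_file = img_dir + str(x) + '.jpg'
                with open(img_file, 'wb') as f:
                    f.write(source.content)
                mylist.append(img_file)
        return mylist  # returned only after every image has been handled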
I used pathlib instead of the os module, and it did not raise the FileNotFoundError:
#!/usr/bin/env python3.6
import time

import praw
import requests
import tweepy
from bs4 import BeautifulSoup as bs
from pathlib import Path

posted_reddit_ids = "posted.txt"

def tweet_creator(subreddit_info):
    """Goes through posts on reddit and extracts a shortened link, title & ID"""
    post_links = []   # list to store our links
    post_titles = []  # list to store our titles
    post_ids = []     # list to store our id's
    post_imgs = []
    print("[bot] extracting posts from sub-reddit")
    for submission in subreddit_info.new(limit=5):
        if not already_tweeted(submission.id):
            post_titles.append(submission.title)
            post_links.append(submission.shortlink)
            post_ids.append(submission.id)
            post_imgs = get_image(submission.url)
            print(post_imgs)
        else:
            print("Already Tweeted")
    return post_links, post_titles, post_ids, post_imgs

def already_tweeted(id):
    """reads through our .txt file and determines if tweet has already been posted"""
    return id in Path(posted_reddit_ids).read_text()

def main():
    """Main function"""
    # If the tweet tracking file does not already exist, create it
    Path(posted_reddit_ids).exists() or Path(posted_reddit_ids).write_text("")
    Path(img_dir).exists() or Path(img_dir).mkdir(parents=True)
    subreddit = setup_connection_reddit(subreddit_to_watch)
    post_links, post_titles, post_ids, post_imgs = tweet_creator(subreddit)
    tweeter(post_links, post_titles, post_ids, post_imgs)

if __name__ == "__main__":
    main()
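A side note, and it is only my guess from the posted code: get_image calls os.chdir(img_dir), so from the second submission onward the relative name 'posted.txt' no longer points at the script's original directory, which would produce exactly this FileNotFoundError. Resolving the tracking file to an absolute path once at startup sidesteps that with either os or pathlib; a minimal sketch:

    # minimal sketch (assumption: posted.txt lives next to the script)
    from pathlib import Path

    posted_reddit_ids = Path(__file__).resolve().parent / "posted.txt"  # absolute, unaffected by os.chdir()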
How can I improve downloading speed with urllib.request? I want to download images from the web, and it works, but it takes too long: it took 42 seconds to execute the download_album_arts() function. What can I do about that? Can I use multiprocessing or something similar?
import os
import shutil
import requests
from bs4 import BeautifulSoup
from urllib import request

URL = 'https://music.bugs.co.kr/chart/track/day/total'
PATH = os.getcwd() + '/static/images/'

# Scraping the html code
def get_html(target_url):
    _html = ""
    response = requests.get(target_url)
    if response.status_code == 200:
        _html = response.text
    return _html

# parse image urls and save them in a list
def get_image_url():
    html = get_html(URL)
    soup = BeautifulSoup(html, 'html.parser')
    img_url = []
    for image in soup.select('a.thumbnail > img'):
        if image.has_attr('src'):
            img_url.append(image.get('src'))
        else:
            continue
    return img_url

# download album art into the static/images directory
def download_album_arts():
    images = get_image_url()
    for i in range(0, 100):
        url = images[i]
        file_name = PATH + str(i + 1) + '.png'
        request.urlretrieve(url, file_name)

# delete all album art
def delete_album_art():
    path = os.getcwd() + '/static/images'
    if os.path.exists(path):
        shutil.rmtree(path)
        os.mkdir(path)
    else:
        os.mkdir(path)
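Since download_album_arts() fetches the 100 images strictly one after another, most of those 42 seconds are spent waiting on the network rather than on the CPU, so threads are usually enough. A minimal sketch with concurrent.futures, reusing get_image_url() and PATH from the code above (the pool size of 10 and the helper name download_one are my own choices):

    from concurrent.futures import ThreadPoolExecutor
    from urllib import request

    def download_one(args):
        # args is an (index, url) pair; save the image as '<index + 1>.png' under PATH
        i, url = args
        request.urlretrieve(url, PATH + str(i + 1) + '.png')

    def download_album_arts_parallel():
        images = get_image_url()[:100]
        with ThreadPoolExecutor(max_workers=10) as pool:
            # consuming the map iterator waits for, and surfaces errors from, every download
            list(pool.map(download_one, enumerate(images)))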
I've looked at the exifread documentation and it says the tags are returned as a dictionary, but the problem is that it returns nothing except {}. I don't know if that means there is no metadata in the image or if I made a newbie mistake. I've spent a good chunk of time looking at my code and the documentation but still can't find the solution; any help would be appreciated :)
Code:
import exifread
import colorama
import urllib2
import urllib
import random
import time
import bs4
import sys

def get_images(target):
    colorama.init()
    print(colorama.Fore.LIGHTGREEN_EX + "[*] Retrieving Meta Data from Target's Page...")
    req = urllib2.Request(target)
    resp = urllib2.urlopen(req)
    page = resp.read()
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        if "www" in src or "http" in src or "https" in src:
            rand_num = random.random()
            name = str(rand_num) + ".jpg"
            urllib.urlretrieve(src, name)
            f = open(name, "rb")
            tags = exifread.process_file(f)
            print (tags)
        else:
            s = target + src
            rand_num = random.random()
            name = str(rand_num) + ".jpg"
            urllib.urlretrieve(s, name)
            f = open(name, "rb")
            tags = exifread.process_file(f)
            print (tags)
    return

def main():
    target = raw_input("Enter the target: ")
    print ("\n")
    get_images(target)
    time.sleep(5)
    sys.exit()

if __name__ == "__main__":
    main()
The problem is that you were not passing a base URL: you need to pass the host and join it to the src value unless the src attribute already contains an absolute URL.
The following code demonstrates a working example; I used requests in place of urllib, but the logic is the same:
import bs4
import os
import exifread
import requests
from urlparse import urljoin

def get_images(target, base):
    page = requests.get(target).content
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img", src=True):
        src = img.get("src")
        name = os.path.basename(src)
        if not src.startswith(("www.", "http:", "https:")):
            src = urljoin(base, src)
        with open(name, "wb+") as f:
            f.write(requests.get(src).content)
            f.seek(0)
            tags = exifread.process_file(f)
            print (tags)

def main():
    target = "http://www.exiv2.org/sample.html"
    # need the base to join relative src values
    base = "http://www.exiv2.org/"
    get_images(target, base)

if __name__ == "__main__":
    main()
You will get the exif data for the one image on the page that has some.
A PIL example:
import bs4
import os
import requests
from urlparse import urljoin
import PIL.Image

def get_images(target, base):
    page = requests.get(target).content
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        name = os.path.basename(src)
        if not src.startswith(("www.", "http:", "https:")):
            src = urljoin(base, src)
        with open(name, "wb+") as f:
            f.write(requests.get(src).content)
            f.seek(0)
            try:
                img = PIL.Image.open(f)
                exif_data = img._getexif()
                print(exif_data)
            except AttributeError as e:
                print("No exif data for {}".format(name))
                os.remove(name)
os.remove(name) deletes files that have no exif data; if you don't want that to happen, remove that line.
I want to save images from a URL to a specific folder, for example 'my_images', and not to the default location (where my *.py file is). Is it possible to do that? Right now my code saves all images to the folder containing the *.py file.
Here is my code:
import urllib.request
from bs4 import BeautifulSoup
import re
import os

BASE_URL = 'https://fachowiec.com/sklep/pl/products/index?Products_page=1&pageSize=15'

def get_domain(url):
    domain = re.findall(r'https:\W\W\w+\.\w+', url)
    return domain[0]

def get_html(url):
    request = urllib.request.urlopen(url)
    return request.read()

def get_img(html):
    soup = BeautifulSoup(html)
    img_box = []
    imgs = soup.find_all('div', class_='pthumb')
    for img in imgs:
        img_box.append(get_domain(BASE_URL) + img.img['src'])
    for img in img_box:
        urllib.request.urlretrieve(img, os.path.basename(img))

def main():
    get_img(get_html('https://fachowiec.com/sklep/pl/products/index?Products_page=1&pageSize=15'))

if __name__ == '__main__':
    main()
def get_img(html):
    soup = BeautifulSoup(html)
    img_box = []
    imgs = soup.find_all('div', class_='pthumb')
    for img in imgs:
        img_box.append(get_domain(BASE_URL) + img.img['src'])

    my_path = '/home/<username>/Desktop'  # use whatever path you like
    for img in img_box:
        urllib.request.urlretrieve(img, os.path.join(my_path, os.path.basename(img)))
You should include the destination path in the second parameter of urllib.request.urlretrieve. Something like below, where "PATH" is your target folder (with a trailing separator):
urllib.request.urlretrieve(img, "PATH" + os.path.basename(img))
The second argument, if present, specifies the file location to copy to (if absent, the location will be a tempfile with a generated name).
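One extra caveat, not from the quoted docs: urlretrieve will not create the destination folder for you, so make sure it exists first. A minimal sketch assuming a my_images folder next to the script and an image URL img from the loop above:

    # minimal sketch: create the target folder once, then save into it
    import os
    import urllib.request

    my_images = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'my_images')
    os.makedirs(my_images, exist_ok=True)
    urllib.request.urlretrieve(img, os.path.join(my_images, os.path.basename(img)))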