Web scraping - saving files to nested folders - python

I am downloading PDF files from different URLs using an API.
The end result should be to download the files for each unique link (links in the code below) into a unique folder (folder_location in the code) on the desktop.
I am quite puzzled about how to arrange the code to do this, as I am still a novice. So far I have tried the following:
import os
import requests
from glob import glob
import time
from urllib.parse import urljoin
from bs4 import BeautifulSoup

links = ["P167897", "P173997", "P166309"]
folder_location = "/pdf/"

for link, folder in zip(links, folder_location):
    time.sleep(10)
    end_point = f"https://search.worldbank.org/api/v2/wds?" \
                f"format=json&includepublicdocs=1&" \
                f"fl=docna,lang,docty,repnb,docdt,doc_authr,available_in&" \
                f"os=0&rows=20&proid={link}&apilang=en"
    documents = requests.get(end_point).json()["documents"]
    for document_data in documents.values():
        try:
            pdf_url = document_data["pdfurl"]
            filename = os.path.join(folder, pdf_url.split('/')[-1])
            with open(filename, 'wb') as f:
                f.write(requests.get(pdf_url).content)
EDIT: To clarify, the items in links are IDs from which the links to the PDF files are identified via the API.

You could try using the pathlib module.
Here's how:
import os
import time
from pathlib import Path

import requests

links = ["P167897", "P173997", "P166309"]

for link in links:
    end_point = f"https://search.worldbank.org/api/v2/wds?" \
                f"format=json&includepublicdocs=1&" \
                f"fl=docna,lang,docty,repnb,docdt,doc_authr,available_in&" \
                f"os=0&rows=20&proid={link}&apilang=en"
    documents = requests.get(end_point).json()["documents"]
    for document_data in documents.values():
        try:
            pdf_url = document_data["pdfurl"]
            file_path = Path(f"pdf/{link}/{pdf_url.rsplit('/')[-1]}")
            file_path.parent.mkdir(parents=True, exist_ok=True)
            with file_path.open("wb") as f:
                f.write(requests.get(pdf_url).content)
            time.sleep(10)
        except KeyError:
            continue
This outputs files to:
pdf/
└── P167897
├── Official-Documents-First-Restatement-to-the-Disbursement-Letter-for-Grant-D6810-SL-and-for-Additional-Financing-Grant-TF0B4694.pdf
└── Official-Documents-Grant-Agreement-for-Additional-Financing-Grant-TF0B4694.pdf
...

Related

Scraping from web site into HDFS

I'm trying to scrape data from a website into HDFS. At first the scraping was working well, but after I added the lines for storing the data into HDFS, it stopped working:
import requests
from pathlib import Path
import os
from datetime import date
from hdfs import InsecureClient

date = date.today()
date

def downloadFile(link, destfolder):
    r = requests.get(link, stream=True)
    filename = "datanew1" + str(date) + ".xls"
    downloaded_file = open(os.path.join(destfolder, filename), 'wb')
    client = InsecureClient('http://hdfs-namenode.default.svc.cluster.local:50070', user='hdfs')
    with client.download('/data/test.csv')
    for chunk in r.iter_content(chunk_size=256):
        if chunk:
            downloaded_file.write(chunk)

link = "https://api.worldbank.org/v2/fr/indicator/FP.CPI.TOTL.ZG?downloadformat=excel"
Path('http://hdfs-namenode.default.svc.cluster.local:50070/data').mkdir(parents=True, exist_ok=True)
downloadFile(link, 'http://hdfs-namenode.default.svc.cluster.local:50070/data')
There is no error when I run the code, but I can't find the scraped data!
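A minimal sketch of one possible approach, assuming the hdfs package (HdfsCLI) is being used: its InsecureClient exposes a write() method that can be used as a context manager to stream data into an HDFS path, so the response chunks can be written directly into HDFS instead of being opened as a local file. The download_to_hdfs helper name and the target filename are illustrative; the namenode URL and the /data path are taken from the question.

import requests
from datetime import date
from hdfs import InsecureClient

def download_to_hdfs(link, hdfs_dir):
    # stream the file from the web
    r = requests.get(link, stream=True)
    filename = "datanew1" + str(date.today()) + ".xls"
    # write the chunks straight into HDFS via WebHDFS
    client = InsecureClient('http://hdfs-namenode.default.svc.cluster.local:50070', user='hdfs')
    with client.write(hdfs_dir + "/" + filename, overwrite=True) as writer:
        for chunk in r.iter_content(chunk_size=256):
            if chunk:
                writer.write(chunk)

link = "https://api.worldbank.org/v2/fr/indicator/FP.CPI.TOTL.ZG?downloadformat=excel"
download_to_hdfs(link, '/data')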

File Not Found Error while Downloading Image files

I am using Windows 8.1. I have been web scraping a lot recently and have been fairly successful at working through errors, but now I am stuck: the files will not download, and I get a
FileNotFoundError.
I have removed all the unknown characters from the file names, but I still get this error. Any help?
I have also made the names lowercase just in case. The error happens when I download the 22nd item; the items before the 22nd one download fine.
My code, and also the Excel file for reference:
import time
import pandas as pd
import requests

Final1 = pd.read_excel("Sneakers.xlsx")
Final1.index += 1

a = Final1.index.tolist()
Images = Final1["Images"].tolist()
Name = Final1["Name"].str.lower().tolist()
Brand = Final1["Brand"].str.lower().tolist()

s = requests.Session()

for i, n, b, l in zip(a, Name, Brand, Images):
    r = s.get(l).content
    with open("Images//" + f"{i}-{n}-{b}.jpg", "wb") as f:
        f.write(r)
Excel File (Google Drive) : Excel File
It seems like you don't have an Images folder in your path.
It's better to use the os.path.join() function for joining paths in Python.
Try the below:
import os
import time
import pandas as pd
import requests

Final1 = pd.read_excel("Sneakers.xlsx")
Final1.index += 1

a = Final1.index.tolist()
Images = Final1["Images"].tolist()
Name = Final1["Name"].str.lower().tolist()
Brand = Final1["Brand"].str.lower().tolist()

# Added: create the Images folder if it does not exist
if not os.path.exists("Images"):
    os.mkdir("Images")

s = requests.Session()

for i, n, b, l in zip(a, Name, Brand, Images):
    r = s.get(l).content
    # with open("Images//" + f"{i}-{n}-{b}.jpg", "wb") as f:
    with open(os.path.join("Images", f"{i}-{n}-{b}.jpg"), "wb") as f:
        f.write(r)

Web Scraping FileNotFoundError When Saving to PDF

I copied some Python code in order to download data from a website. Here is my specific website:
https://www.codot.gov/business/bidding/bid-tab-archives/bid-tabs-2017-1
Here is the code which I copied:
import requests
from bs4 import BeautifulSoup

def _getUrls_(res):
    hrefs = []
    soup = BeautifulSoup(res.text, 'lxml')
    main_content = soup.find('div', {'id': 'content-core'})
    table = main_content.find("table")
    for a in table.findAll('a', href=True):
        hrefs.append(a['href'])
    return(hrefs)

bidurl = 'https://www.codot.gov/business/bidding/bid-tab-archives/bid-tabs-2017-1'
r = requests.get(bidurl)
hrefs = _getUrls_(r)

def _getPdfs_(hrefs, basedir):
    for i in range(len(hrefs)):
        print(hrefs[i])
        respdf = requests.get(hrefs[i])
        pdffile = basedir + "/pdf_dot/" + hrefs[i].split("/")[-1] + ".pdf"
        try:
            with open(pdffile, 'wb') as p:
                p.write(respdf.content)
                p.close()
        except FileNotFoundError:
            print("No PDF produced")

basedir = "/Users/ABC/Desktop"
_getPdfs_(hrefs, basedir)
The code runs successfully, but it does not download anything at all, and there is no visible FileNotFoundError.
I tried the following two URLs:
https://www.codot.gov/business/bidding/bid-tab-archives/bid-tabs-2017/aqc-088a-035-20360
https://www.codot.gov/business/bidding/bid-tab-archives/bid-tabs-2017/aqc-r100-258-21125
However, both of these URLs just print >>> No PDF produced.
The thing is that the code worked and downloaded successfully for other people, but not for me.
Your code works; I just tested it. You need to make sure the basedir exists, so you want to add this to your code:
if not os.path.exists(basedir):
    os.makedirs(basedir)
I used this exact code, only replacing basedir with my own directory, and it worked only after I made sure that the path actually exists. Your code does not create the folder if it does not exist.
As others have pointed out, you need to create basedir beforehand. The user running the script may not have the directory created. Make sure you insert this code at the beginning of the script, before the main logic.
Additionally, hardcoding the base directory might not be a good idea when transferring the script to different systems. It would be preferable to use the user's %USERPROFILE% environment variable:
from os import environ
from os.path import join

basedir = join(environ["USERPROFILE"], "Desktop", "pdf_dot")
Which would be the same as C:\Users\blah\Desktop\pdf_dot.
However, the above environment variable only works on Windows. If you want it to work on Linux, you will have to use os.environ["HOME"] instead.
If you need to transfer between both systems, then you can use os.name:
from os import name
from os import environ
from os.path import join

# Windows
if name == 'nt':
    basedir = join(environ["USERPROFILE"], "Desktop", "pdf_dot")
# Linux
elif name == 'posix':
    basedir = join(environ["HOME"], "Desktop", "pdf_dot")
You don't need to specify the directory or create any folder manually. All you need to do is run the following script. When the execution is done, you should get a folder named pdf_dot on your desktop containing the PDF files you wish to grab.
import requests
from bs4 import BeautifulSoup
import os

URL = 'https://www.codot.gov/business/bidding/bid-tab-archives/bid-tabs-2017-1'

dirf = os.environ['USERPROFILE'] + '\Desktop\pdf_dot'
if not os.path.exists(dirf): os.makedirs(dirf)
os.chdir(dirf)

res = requests.get(URL)
soup = BeautifulSoup(res.text, 'lxml')
pdflinks = [itemlink['href'] for itemlink in soup.find_all("a", {"data-linktype": "internal"}) if "reject" not in itemlink['href']]
for pdflink in pdflinks:
    filename = f'{pdflink.split("/")[-1]}{".pdf"}'
    with open(filename, 'wb') as f:
        f.write(requests.get(pdflink).content)

How to download all images from a URL with Python 2.7 - Problem

I am trying to download all images from 'https://www.nytimes.com/section/todayspaper' with this code:
import requests
from io import open as iopen
from urlparse import urlsplit

file_url = 'https://www.nytimes.com/section/todayspaper'

def requests_image(file_url):
    suffix_list = ['jpg', 'gif', 'png', 'tif', 'svg',]
    file_name = urlsplit(file_url)[2].split('/')[-1]
    file_suffix = file_name.split('.')[1]
    i = requests.get(file_url)
    if file_suffix in suffix_list and i.status_code == requests.codes.ok:
        with iopen(file_name, 'wb') as file:
            file.write(i.content)
    else:
        return False
No error occurs when I run it:
>>>
>>>
but I don't know where the images were downloaded on my PC.
I checked the Downloads folder and they aren't there.
If you want to download all images on the page you should:
Download the web page
Find all image tags (<img>)
Scan all image tags and find the src attribute content
Download all files from the found links
import os
import hashlib

import requests
from bs4 import BeautifulSoup

page_url = 'https://www.nytimes.com/section/todayspaper'

# Download page html
page_data = requests.get(page_url).text

# Find all links in page
images_urls = [
    image.attrs.get('src')
    for image in BeautifulSoup(page_data, 'lxml').find_all('img')
]

# Clean empty links (<img src="" /> <img> etc)
images_urls = [
    image_url
    for image_url in images_urls
    if image_url and len(image_url) > 0
]

# Download files
def download_image(source_url, dest_dir):
    # TODO: add filename extension
    image_name = hashlib.md5(source_url.encode()).hexdigest()
    with open(os.path.join(dest_dir, image_name), 'wb') as f:
        image_data = requests.get(source_url).content
        f.write(image_data)

for image_url in images_urls:
    download_image(image_url, './tmp')

Unable to save downloaded images into a folder on the desktop using python

I have made a scraper which at this moment parses image links and saves the downloaded images into the Python directory by default. The only thing I want to do now is choose a folder on the desktop to save those images in, but I can't. Here is what I'm up to:
import requests
import os.path
import urllib.request
from lxml import html

def Startpoint():
    url = "https://www.aliexpress.com/"
    response = requests.get(url)
    tree = html.fromstring(response.text)
    titles = tree.xpath('//div[@class="item-inner"]')
    for title in titles:
        Pics = "https:" + title.xpath('.//span[@class="pic"]//img/@src')[0]
        endpoint(Pics)

def endpoint(images):
    sdir = (r'C:\Users\ar\Desktop\mth')
    testfile = urllib.request.URLopener()
    xx = testfile.retrieve(images, images.split('/')[-1])
    filename = os.path.join(sdir, xx)
    print(filename)

Startpoint()
Upon execution the above code throws an error showing: "join() argument must be str or bytes, not 'tuple'"
You can download images with Python's urllib. You can see the official Python documentation here: urllib documentation for Python 2.7. If you want to use Python 3, then follow this documentation: urllib for Python 3.
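As a minimal sketch (assuming Python 3 and a direct image URL; the folder path is the one from the question, and the example image URL is a placeholder), urllib.request.urlretrieve can save an image straight into a chosen folder:

import os
import urllib.request

save_dir = r'C:\Users\ar\Desktop\mth'          # target folder from the question
image_url = 'https://example.com/picture.jpg'  # placeholder image URL

os.makedirs(save_dir, exist_ok=True)           # create the folder if it is missing
# urlretrieve takes the destination file path as its second argument
urllib.request.urlretrieve(image_url, os.path.join(save_dir, image_url.split('/')[-1]))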
You could use urllib.request, BytesIO from io, and PIL's Image
(if you have a direct URL to the image):
from PIL import Image
from io import BytesIO
import urllib.request

def download_image(url):
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    content = response.read()
    img = Image.open(BytesIO(content))
    img.filename = url
    return img
The images are dynamic now, so I thought I would update this post:
import os
from selenium import webdriver
import urllib.request
from lxml.html import fromstring

url = "https://www.aliexpress.com/"

def get_data(link):
    driver.get(link)
    tree = fromstring(driver.page_source)
    for title in tree.xpath('//li[@class="item"]'):
        pics = "https:" + title.xpath('.//*[contains(@class,"img-wrapper")]//img/@src')[0]
        os.chdir(r"C:\Users\WCS\Desktop\test")
        urllib.request.urlretrieve(pics, pics.split('/')[-1])

if __name__ == '__main__':
    driver = webdriver.Chrome()
    get_data(url)
    driver.quit()
This is the code to download an HTML file from the web:
import random
import urllib.request

def download(url):
    name = random.randrange(1, 1000)
    # this is the random function to give the name to the file
    full_name = str(name) + ".html"  # compatible data type
    urllib.request.urlretrieve(url, full_name)  # main function

download("any url")
This is code for downloading any HTML file from the internet; you just have to provide the link to the function.
In your case you have said that you retrieved the image links from the web page, so you can change the extension from ".html" to a compatible type. The problem is that the images can have different extensions, maybe ".jpg", ".png", etc.
So what you can do is match the ending of the link using if/else string matching and then assign the extension at the end.
Here is an example for illustration:
import random
import urllib.request

def download(url):
    name = random.randrange(1, 1000)
    # this is the random function to give the name to the file
    if url.endswith(".png"):
        full_name = str(name) + ".png"   # compatible extension with .png
    elif url.endswith(".jpg"):
        full_name = str(name) + ".jpg"   # compatible extension with .jpg
    else:
        full_name = str(name) + ".html"  # fall back to .html
    urllib.request.urlretrieve(url, full_name)  # main function

download("any url")
You can use multiple if/else branches for the various types of extension.
If it helps your situation, give it a thumbs up, buddy.
