Download all files with a given extension from a page - Python

I am trying to download all netcdf (.nc) files here:
https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/2000/
import urllib3
from bs4 import BeautifulSoup
site = urllib3.PoolManager()
base_url = 'https://www.ncei.noaa.gov//data//avhrr-land-normalized-difference-vegetation-index//access//'
html = site.request('GET', base_url + '//' + '2000')
soup = BeautifulSoup(html.data, "lxml")
list_urls = soup.find_all('.nc')
However, list_urls is empty after running this code. How can I fix it?

Here is what I did: soup.find_all(text=lambda t: ".nc" in t), and it works fine, with a progress bar as well :)
import sys
import requests
import urllib3
import humanize
from bs4 import BeautifulSoup

site = urllib3.PoolManager()
base_url = 'https://www.ncei.noaa.gov//data//avhrr-land-normalized-difference-vegetation-index//access//'
html = site.request('GET', base_url + '//' + '2000')
soup = BeautifulSoup(html.data, "lxml")
link_urls = soup.find_all(text=lambda t: ".nc" in t)

for link in link_urls:
    download_link = "{}2000/{}".format(base_url, link)
    r = requests.get(download_link, stream=True)
    total_length = r.headers.get('content-length')
    print("\nDownloading: {}\nTotalSize: {}".format(download_link, humanize.naturalsize(total_length)))
    with open(link, "wb") as f:
        print("Downloading %s" % link)
        if total_length is None:  # no content-length header
            f.write(r.content)
        else:
            dl = 0
            total_length = int(total_length)
            for data in r.iter_content(chunk_size=4096):
                dl += len(data)
                f.write(data)
                done = int(50 * dl / total_length)
                sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done)))
                sys.stdout.flush()
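An alternative to matching on the link text is to select the anchors whose href ends in .nc. This also explains why the original find_all('.nc') returned nothing: find_all matches tag names, not file extensions. A minimal sketch, assuming the directory listing is a plain HTML index whose hrefs are relative file names:

import urllib3
from bs4 import BeautifulSoup

site = urllib3.PoolManager()
year_url = 'https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/2000/'
html = site.request('GET', year_url)
soup = BeautifulSoup(html.data, "lxml")

# CSS attribute selector: every <a> whose href ends with ".nc"
nc_links = [year_url + a['href'] for a in soup.select('a[href$=".nc"]')]
print(len(nc_links), "files found")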

Related

Downloading Zip file from a webpage

I am trying to download the zip file, which can also be downloaded by clicking "SCARICA I DATI CSV" ("download the CSV data") on this webpage. I want to do this for 7000+ Italian municipalities using Beautiful Soup.
Right now, I have the following code for one city/municipality:
import urllib.request
from bs4 import BeautifulSoup

city_name = "vandoies-vintl"
prov_name = "bz"

r = urllib.request.urlopen('http://storico.openbilanci.it/bilanci/' + city_name + "-comune-" + prov_name).read()
soup = BeautifulSoup(r, 'lxml')

# this is where the code breaks, because the HTML body does not have any mention of "csv" whatsoever, which is weird
csv = soup.find_all('a', attrs={'class': 'pull-right csv'})
csvlink = csv[0]['href']
urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, city_name + ".zip")
I can't find any mention of csv when checking using print(soup). Could someone please help? Thanks!
The following code works.
import os
import re
import urllib.request
import zipfile

import pandas as pd
from bs4 import BeautifulSoup

output_path = r"/Users/aartimalik/Dropbox/delphine-miscellaneous/italy/test"

munis = [("monale", "at"), ("portacomaro", "at")]
munis = pd.DataFrame(munis)
munis.columns = ['municipality_clean', 'prov_abb']

def remove_paren(string):
    return re.sub(r'\(.*\)', '', str(string))

munis['municipality_clean'] = munis['municipality_clean'].apply(lambda x: remove_paren(x))
munis['municipality_clean'] = munis['municipality_clean'].str.strip()
munis = munis.replace(' ', '-', regex=True)
munis = munis.apply(lambda x: x.str.lower())

# track which municipalities succeed and which fail (DataFrame.append needs pandas < 2.0)
scrapesuccess = pd.DataFrame(columns=munis.columns)
scrapefail = pd.DataFrame(columns=munis.columns)

for i in range(0, len(munis)):
    city_name = munis.iloc[i]['municipality_clean']
    prov_name = munis.iloc[i]['prov_abb']
    try:
        r = urllib.request.urlopen('http://storico.openbilanci.it/bilanci/' + city_name + "-comune-" + prov_name).read()
        soup = BeautifulSoup(r, 'lxml')
        csv = soup.find_all('a', attrs={'class': 'pull-right csv'})
        try:
            csvlink = csv[0]['href']
            # save the zip inside output_path so the extraction step below finds it
            zip_path = output_path + "/" + city_name + ".zip"
            urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, zip_path)
            #print('Downloaded and extracted zip for ' + city_name + ', ' + prov_name)
            print(str(i) + ". " + city_name + ": success")
            scrapesuccess = scrapesuccess.append(munis.iloc[i])
            newfolder = output_path + "/" + city_name.capitalize()
            if not os.path.exists(newfolder):
                os.makedirs(newfolder)
            zip_ref = zipfile.ZipFile(zip_path, 'r')
            zip_ref.extractall(newfolder)
            zip_ref.close()
        except:
            scrapefail = scrapefail.append(munis.iloc[i])
            print(str(i) + ". " + city_name + ": fail")
    except:
        scrapefail = scrapefail.append(munis.iloc[i])
        print(str(i) + ". " + city_name + ": fail")
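One caveat: DataFrame.append was removed in pandas 2.0, so the bookkeeping above only runs on older pandas. A sketch of the same idea that collects the rows in plain lists and builds the DataFrames at the end (names are illustrative):

import pandas as pd

success_rows, fail_rows = [], []

# inside the loop, instead of DataFrame.append:
#   success_rows.append(munis.iloc[i])   on success
#   fail_rows.append(munis.iloc[i])      on failure

# after the loop, build the summary frames in one go
scrapesuccess = pd.DataFrame(success_rows)
scrapefail = pd.DataFrame(fail_rows)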
Here's an example of downloading the zip in memory and writing a city directory with all the CSV files.
import urllib.request as request
from io import StringIO
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
from bs4 import BeautifulSoup


class Scraper:
    def __init__(self, **kwargs):
        self.url_root = "http://storico.openbilanci.it"
        self.city_name = kwargs.get("city_name")
        self.prov_name = kwargs.get("prov_name")

    def main(self) -> None:
        file_link = self.get_link()
        zipped_file = self.download_file(file_link)
        unzipped_files_mapping = self.unzip_file(zipped_file)
        self.write_files(unzipped_files_mapping)

    def get_link(self) -> str:
        url = f"{self.url_root}/bilanci/{self.city_name}-comune-{self.prov_name}"
        response = request.urlopen(url).read()
        soup = BeautifulSoup(response, "lxml")
        return soup.find_all("a", attrs={"class": "pull-right csv"})[0]["href"]

    def download_file(self, zip_link: str) -> str:
        url = f"{self.url_root}{zip_link}"
        return request.urlretrieve(url)[0]

    @staticmethod
    def unzip_file(file_handle: str) -> dict:
        zip_file_object = ZipFile(file_handle, "r")
        files = zip_file_object.namelist()
        return {
            file: pd.read_csv(StringIO(zip_file_object.open(file).read().decode("utf-8")), sep=";")
            for file in files
        }

    def write_files(self, file_mapping: dict) -> None:
        for file, df in file_mapping.items():
            file_path, file_name = file.rsplit("/", 1)
            path = Path(f"/path/to/files/{self.city_name}/{file_path}")
            path.mkdir(parents=True, exist_ok=True)
            df.to_csv(f"{path}/{file_name}")


city_name = "vandoies-vintl"
prov_name = "bz"

Scraper(city_name=city_name, prov_name=prov_name).main()
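Strictly speaking, urlretrieve still writes the zip to a temporary file on disk. If a fully in-memory download is wanted, the archive can be wrapped in BytesIO instead; a small sketch (the URL handling mirrors download_file above):

import urllib.request as request
from io import BytesIO
from zipfile import ZipFile

def download_zip_in_memory(url: str) -> ZipFile:
    # read the whole response body into memory and open it as a zip archive
    raw = request.urlopen(url).read()
    return ZipFile(BytesIO(raw))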

Downloading multiple files from a URL directory with similar names

I am trying to download multiple .csv files from a URL directory with similar names (AB_daily.csv, BC_daily.csv, etc.). However, each file is stored in a different folder in the directory. I know there is a way to use a loop to extract the files, but I can't figure out how to do it with Beautiful Soup or glob. Do you have any suggestions? I've also used pandas.read_csv() to look for shortcuts, as I'm just trying to concatenate the files together later. Thank you.
URL Directory: https://dd.weather.gc.ca/hydrometric/csv/
import requests
from bs4 import BeautifulSoup

def main(url):
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        target = [f"{url[:20]}{item['href']}" for item in soup.select(
            "a[href$='AB_daily_hydrometric.csv']")]
        for x in target:
            print(f"Downloading {x}")
            r = req.get(x)
            name = x.rsplit("/", 1)[-1]
            with open(name, 'wb') as f:
                f.write(r.content)

main('https://dd.weather.gc.ca/hydrometric/csv')
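Slicing the base URL with url[:20] is brittle; urllib.parse.urljoin resolves an href (relative or absolute) against the page it was found on. A short sketch with a hypothetical file name:

from urllib.parse import urljoin

page_url = 'https://dd.weather.gc.ca/hydrometric/csv/AB/daily/'
href = 'AB_XXXXXXX_daily_hydrometric.csv'  # hypothetical file name from the listing
print(urljoin(page_url, href))
# -> https://dd.weather.gc.ca/hydrometric/csv/AB/daily/AB_XXXXXXX_daily_hydrometric.csv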
For this specific task, the following will work:

import requests
from bs4 import BeautifulSoup

csv_links = []
links = ["AB/", "BC/", "MB/", "NB/", "NL/", "NS/", "NT/", "NU/", "ON/", "PE/", "QC/", "SK/", "YT/"]

def main(url):
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        # collect every .csv link from each province's daily/ and hourly/ folders
        for item in links:
            r_daily = BeautifulSoup(req.get(url + item + "daily/").content, 'html.parser')
            r_hourly = BeautifulSoup(req.get(url + item + "hourly/").content, 'html.parser')
            for item_d in r_daily.find_all('a'):
                if ".csv" in item_d.get('href'):
                    csv_links.append(url + item + "daily/" + item_d.get('href'))
            for item_h in r_hourly.find_all('a'):
                if ".csv" in item_h.get('href'):
                    csv_links.append(url + item + "hourly/" + item_h.get('href'))
        for x in csv_links:
            print(f"Downloading {x}")
            r = req.get(x)
            name = x.rsplit("/", 1)[-1]
            with open(name, 'wb') as f:
                f.write(r.content)

main('https://dd.weather.gc.ca/hydrometric/csv/')
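If you would rather not hard-code the province folders, the top-level index can be scraped for its subdirectories first; a sketch assuming the listing links each folder with a relative href like "AB/":

import requests
from bs4 import BeautifulSoup

base = 'https://dd.weather.gc.ca/hydrometric/csv/'
soup = BeautifulSoup(requests.get(base).content, 'html.parser')

# keep hrefs that look like relative subfolders; skip parent-directory and sort links
folders = [a['href'] for a in soup.find_all('a', href=True)
           if a['href'].endswith('/') and not a['href'].startswith(('/', '.', '?'))]
print(folders)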

How can I convert a list of crawled data into Excel columns?

import openpyxl
xl_file = openpyxl.Workbook()
xl_sheet = xl_file.active

from urllib.request import urlopen
from bs4 import BeautifulSoup

stockItem = '028300'
url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem
html = urlopen(url)
source = BeautifulSoup(html.read(), "html.parser")
maxPage = source.find_all("table", align="center")
mp = maxPage[0].find_all("td", class_="pgRR")
mpNum = int(mp[0].a.get('href')[-3:])

for page in range(1, 10):
    print(str(page))
    url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem + '&page=' + str(page)
    html = urlopen(url)
    source = BeautifulSoup(html.read(), "html.parser")
    srlists = source.find_all("tr")
    isCheckNone = None
    if (page % 1) == 0:
        time.sleep(0)
    for i in range(1, len(srlists) - 1):
        if srlists[i].span != isCheckNone:
            srlists[i].td.text
            data1 = srlists[i].find_all("td", align="center")
            data2 = srlists[i].find_all("td", class_="num")
            print(srlists[i].find_all("td", align="center")[0].text, srlists[i].find_all("td", class_="num")[0].text)
            for item in data1:
                xl_sheet.append([item.get_text()])
This is what I've done for crawling stock data from the site.
I've successfully crawled the stock data.
However, I couldn't save the data into an Excel file.
I tried, but it only showed the date data without the price data.
How can I convert the results to an Excel file?
There were two things you missed:
1) A mistake in importing packages
2) data2, which contains the prices, was never appended to the Excel sheet
Here is the final code, which will give your desired output. Just put in your folder location for saving the Excel file.
import time
from urllib.request import urlopen

from openpyxl import Workbook
from bs4 import BeautifulSoup

xl_file = Workbook()
xl_sheet = xl_file.active

stockItem = '028300'
url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem
html = urlopen(url)
source = BeautifulSoup(html.read(), "html.parser")
maxPage = source.find_all("table", align="center")
mp = maxPage[0].find_all("td", class_="pgRR")
mpNum = int(mp[0].a.get('href')[-3:])

for page in range(1, 10):
    print(str(page))
    url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem + '&page=' + str(page)
    html = urlopen(url)
    source = BeautifulSoup(html.read(), "html.parser")
    srlists = source.find_all("tr")
    isCheckNone = None
    if (page % 1) == 0:
        time.sleep(0)
    for i in range(1, len(srlists) - 1):
        if srlists[i].span != isCheckNone:
            data1 = srlists[i].find_all("td", align="center")
            data2 = srlists[i].find_all("td", class_="num")
            # append the date cells (data1) and the price cells (data2) side by side
            for item1, item2 in zip(data1, data2):
                xl_sheet.append([item1.get_text(), item2.get_text()])

print(xl_sheet)
xl_file.save(r'C:\Users\Asus\Desktop\vi.xlsx')
Suggestion: you can use the yfinance package for Python to download stock data easily.
You can follow this link: https://pypi.org/project/yfinance/
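For example, a minimal yfinance sketch (the ".KQ" Yahoo suffix for KOSDAQ code 028300 is an assumption, and to_excel needs openpyxl installed):

import yfinance as yf

# daily prices for the assumed ticker, written straight to an Excel file
data = yf.download("028300.KQ", start="2021-01-01", end="2021-06-30")
data.to_excel("028300_prices.xlsx")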

Scrape multiple pages and put the results in one CSV

I am a newbie at programming, and I am starting with Python. I use it for scraping data from websites, online shops to be specific. I want to scrape every page of the results (with pagination) and put the result URLs in one CSV.
This is what I've been trying:
import selenium
import bs4
from selenium import webdriver
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

myurl = 'https://www.tokopedia.com/p/rumah-tangga/alat-pertukangan/obeng?keyword=obeng&page='
chrome_path = '/home/yoga/Downloads/chromedriver'
driver = webdriver.Chrome(chrome_path)

#opening webpage
for number in range(10):
    buka = driver.get(myurl + str(number))
    page_source = driver.page_source
    soup_this = soup(page_source, "html.parser")
    product_links = soup_this.findAll("div", {"class": "product-summary"})
    for number2 in range(10):
        filename = "tokopedia" + str(number2) + ".csv"
        f = open(filename, "w")
        headers = "Link" + "\n"
        f.write(headers)
        for product in product_links:
            barang = product.a["ng-href"]
            print(barang + "\n")
            f.write(barang + "\n")
        f.close()

driver.close()
The result that I got in the CSV is only for one page. Can you guys help me?
import selenium
import bs4
from selenium import webdriver
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

myurl = 'https://www.tokopedia.com/p/rumah-tangga/alat-pertukangan/obeng?keyword=obeng&page='
chrome_path = '/home/yoga/Downloads/chromedriver'
driver = webdriver.Chrome(chrome_path)

# open the output file once, so every page is appended to the same CSV
filename = "tokopedia.csv"
f = open(filename, "w")
f.write("Link" + "\n")

#opening webpage
for number in range(10):
    buka = driver.get(myurl + str(number))
    page_source = driver.page_source
    soup_this = soup(page_source, "html.parser")
    product_links = soup_this.findAll("div", {"class": "product-summary"})
    for product in product_links:
        barang = product.a["ng-href"]
        print(barang + "\n")
        f.write(barang + "\n")

f.close()
driver.close()
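A small design note: opening the file with a with block and the csv module avoids the manual f.close() and newline handling; a sketch of the writing part only (link_rows is a placeholder for the scraped URLs):

import csv

link_rows = []  # placeholder: fill this with the scraped URLs

with open("tokopedia.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Link"])      # header written once
    for link in link_rows:
        writer.writerow([link])    # one URL per row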

How can I improve downloading speed with python urllib.request

How can I improve downloading speed with urllib.request? I want to download images from the web, and it works well, but it takes too long. It took 42 seconds to execute the download_album_arts() function. What can I do about that? Can I use multiprocessing or something similar?
import os
import shutil
import requests
from bs4 import BeautifulSoup
from urllib import request

URL = 'https://music.bugs.co.kr/chart/track/day/total'
PATH = os.getcwd() + '/static/images/'

# Scraping html code
def get_html(target_url):
    _html = ""
    response = requests.get(target_url)
    if response.status_code == 200:
        _html = response.text
    return _html

# parse image url and save in list
def get_image_url():
    html = get_html(URL)
    soup = BeautifulSoup(html, 'html.parser')
    img_url = []
    for image in soup.select('a.thumbnail > img'):
        if image.has_attr('src'):
            img_url.append(image.get('src'))
        else:
            continue
    return img_url

# download album art in static/images directory
def download_album_arts():
    images = get_image_url()
    for i in range(0, 100):
        url = images[i]
        file_name = PATH + str(i + 1) + '.png'
        request.urlretrieve(url, file_name)

# delete all album art
def delete_album_art():
    path = os.getcwd() + '/static/images'
    if os.path.exists(path):
        shutil.rmtree(path)
        os.mkdir(path)
    else:
        os.mkdir(path)
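Since the images are fetched one at a time, most of the 42 seconds is network wait, so threads help. A minimal sketch that parallelises the same urlretrieve calls with concurrent.futures, reusing get_image_url and PATH from the code above (the worker count of 16 is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor
from urllib import request

def download_one(indexed_url):
    i, url = indexed_url
    # each worker fetches a single image; file naming mirrors the original loop
    request.urlretrieve(url, PATH + str(i + 1) + '.png')

def download_album_arts_parallel():
    images = get_image_url()[:100]
    # I/O-bound downloads usually scale well with a thread pool
    with ThreadPoolExecutor(max_workers=16) as pool:
        list(pool.map(download_one, enumerate(images)))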
