I need to download the images of every coin listed on CoinGecko, so I wrote the following code:
import requests
from bs4 import BeautifulSoup
from os.path import basename

def getdata(url):
    r = requests.get(url)
    return r.text

htmldata = getdata("https://www.coingecko.com/en")
soup = BeautifulSoup(htmldata, 'html.parser')

for item1 in soup.select('.coin-icon img'):
    link = item1.get('data-src').replace('thumb', 'thumb_2x')
    with open(basename(link), "wb") as f:
        f.write(requests.get(link).content)
However, I need to save the images with names matching each coin's ticker on that CoinGecko list (rename bitcoin.png?1547033579 to BTC.png, ethereum.png?1595348880 to ETH.png, and so forth). There are over 7000 images to rename, and many of them have quite unique names, so slicing does not work here.
What is the way to do it?
I was browsing the HTML and found that the tag you are selecting has an alt attribute with the ticker at the end of the string.
<div class="coin-icon mr-2 center flex-column">
<img class="" alt="bitcoin (BTC)" data-src="https://assets.coingecko.com/coins/images/1/thumb/bitcoin.png?1547033579" data-srcset="https://assets.coingecko.com/coins/images/1/thumb_2x/bitcoin.png?1547033579 2x" src="https://assets.coingecko.com/coins/images/1/thumb/bitcoin.png?1547033579" srcset="https://assets.coingecko.com/coins/images/1/thumb_2x/bitcoin.png?1547033579 2x">
</div>
So we can use that to get the correct name like so:
import requests
from bs4 import BeautifulSoup

def getdata(url):
    r = requests.get(url)
    return r.text

htmldata = getdata("https://www.coingecko.com/en")
soup = BeautifulSoup(htmldata, 'html.parser')

for item1 in soup.select('.coin-icon img'):
    link = item1.get('data-src').replace('thumb', 'thumb_2x')
    raw_name = item1.get('alt')
    # alt looks like "bitcoin (BTC)"; take everything between the parentheses
    name = raw_name[raw_name.find('(') + 1:-1]
    with open(f"{name}.png", "wb") as f:
        f.write(requests.get(link).content)
We are basically extracting the value between the parentheses using string slicing.
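If the slicing ever feels brittle, a regular expression can pull out the same ticker; here is a minimal sketch (the sample alt text is taken from the snippet above):

import re

raw_name = "bitcoin (BTC)"

# Capture whatever sits inside the final pair of parentheses.
match = re.search(r'\(([^)]+)\)$', raw_name)
if match:
    print(match.group(1))  # BTC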
Alternatively, you could do something like this:
import requests
from bs4 import BeautifulSoup

url = "https://www.coingecko.com/en"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

for item1 in soup.select('td.coin-name[data-text]'):
    ticker_name = item1.select_one(".center > span").get_text(strip=True)
    image_link = item1.select_one(".coin-icon > img").get('data-src').replace('thumb', 'thumb_2x')
    with open(f"{ticker_name}.png", "wb") as f:
        f.write(requests.get(image_link).content)
I believe you could achieve this very easily using string slicing:
import requests
from bs4 import BeautifulSoup
from os.path import basename

def getdata(url):
    r = requests.get(url)
    return r.text

htmldata = getdata("https://www.coingecko.com/en")
soup = BeautifulSoup(htmldata, 'html.parser')

for item1 in soup.select('.coin-icon img'):
    link = item1.get('data-src').replace('thumb', 'thumb_2x')
    with open(basename(link[:link.find('?')]), "wb") as f:
        f.write(requests.get(link).content)
I am slicing the link string with [:] up to the question mark that marks the beginning of the query string.
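If you would rather not hunt for the question mark yourself, the standard library can split the URL for you; a minimal sketch, using the bitcoin link from above as sample input:

from os.path import basename
from urllib.parse import urlparse

link = "https://assets.coingecko.com/coins/images/1/thumb_2x/bitcoin.png?1547033579"

# urlparse separates the path from the query string for us.
filename = basename(urlparse(link).path)
print(filename)  # bitcoin.png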
I am new to web scraping. I am trying to download only the weekly .zip files from the website below. I was able to parse the label but couldn't get beyond that to download the weekly zip files.
https://download.cms.gov/nppes/NPI_Files.html
HTML code for the 'li' tag:
import requests
from bs4 import BeautifulSoup

URL = "https://download.cms.gov/nppes/NPI_Files.html"
r = requests.get(URL)
#print(r.content)
soup = BeautifulSoup(r.content, 'html.parser')
#print(soup.prettify())

links = []
ref = soup.select('li')
#print(ref)
for i in ref:
Try the next example:
import requests
from bs4 import BeautifulSoup

URL = "https://download.cms.gov/nppes/NPI_Files.html"
r = requests.get(URL)
soup = BeautifulSoup(r.content, 'html.parser')

links = []
# The first two list items are not weekly files, so skip them with [2:].
for ref in soup.select('div.bulletlistleft > ul > li')[2:]:
    zip_url = 'https://download.cms.gov/nppes' + ref.a.get('href').replace('./', '/')
    links.append(zip_url)
print(links)
Output:
['https://download.cms.gov/nppes/NPPES_Data_Dissemination_090522_091122_Weekly.zip', 'https://download.cms.gov/nppes/NPPES_Data_Dissemination_091222_091822_Weekly.zip', 'https://download.cms.gov/nppes/NPPES_Data_Dissemination_091922_092522_Weekly.zip']
Access the a tags if there are any:
for i in ref:
    if i.a is not None:
        print(i.a.get('href'))
You can either just select all links then filter them down to the weekly updates:
ref = [
    a for a in soup.select('a')
    if a.text and 'Weekly Update' in a.text
]
OR you can narrow down to a parent element before selecting the links:
ref = soup.find(
    'b', string='Weekly Incremental NPI Files'
).find_next('tr').select('a')
Either way, you'll end up with the same set of links.
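From there, downloading is just a matter of resolving each href against the page URL; a minimal sketch, assuming ref holds the anchor tags collected by either approach above:

import requests
from urllib.parse import urljoin

page_url = "https://download.cms.gov/nppes/NPI_Files.html"

for a in ref:
    file_url = urljoin(page_url, a.get('href'))  # resolves the relative './...' hrefs
    filename = file_url.rsplit('/', 1)[-1]
    with open(filename, 'wb') as f:
        f.write(requests.get(file_url).content)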
The following code will find the links, download the files, and unzip them.
import requests
from bs4 import BeautifulSoup
import os
from urllib.request import urlretrieve
import zipfile

target_folder = os.path.dirname(os.path.realpath(__file__))
base_url = "https://download.cms.gov/nppes"
url = f"{base_url}/NPI_Files.html"

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

elms = soup.select('div.bulletlistleft > ul > li > a')
for elm in elms:
    # hrefs look like './NPPES_..._Weekly.zip'; strip the leading './'
    zip_filename = elm.attrs['href'].lstrip('./')
    zip_full_url = "/".join((base_url, zip_filename))
    target_zip_path = os.path.join(target_folder, zip_filename)
    # Extract next to the archive, into a folder named after the zip file.
    target_zip_dir = ".".join(target_zip_path.split('.')[:-1])
    urlretrieve(zip_full_url, target_zip_path)
    with zipfile.ZipFile(target_zip_path, 'r') as zip_file:
        zip_file.extractall(target_zip_dir)
I am trying to scrape the prices from a website, and the scraping works, but I can't write the result to a text file.
This is my Python code:
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.futbin.com/stc/cheapest"
r = requests.get(url)
soup = bs(r.content, "html.parser")

price = soup.find("div", {"class": "d-flex row col-md-9 px-0"})
name = "example"
f = open(name + '.txt', "a")
f.write(price.text)
This does not work, but if I print the result instead of writing it to a text file, it works. I have searched for a long time but don't understand it. I think it must be a string to be written to a text file, but I don't know how to convert the output to a string.
You're getting an error due to a Unicode character.
Try adding encoding='utf-8' when opening the file.
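That one change is enough to make the original write succeed; a minimal sketch of the fix, reusing the price variable from the question:

with open("example.txt", "a", encoding="utf-8") as f:
    f.write(price.text)  # no UnicodeEncodeError now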
Also, your code gives a bit of a messy output. Try this instead:
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.futbin.com/stc/cheapest"
r = requests.get(url)
soup = bs(r.content, "html.parser")

rows = soup.find("div", {"class": "d-flex row col-md-9 px-0"})
prices = rows.findAll("span", {"class": "price-holder-row"})
names = rows.findAll("div", {"class": "name-holder"})

price_list = []
name_list = []
for price in prices:
    price_list.append(price.text.strip("\n "))
for name in names:
    name_list.append(name.text.split()[0])

filename = "example"
with open(f"{filename}.txt", mode='w', encoding='utf-8') as f:
    for name, price in zip(name_list, price_list):
        f.write(f"{name}:{price}\n")
I would like to extract every 'data-src' attribute from this page and then save the results to a CSV file. There are several 'data-src' attributes on the page, all in the same class, and I don't know how to deal with that.
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from csv import writer

def test_list():
    with open('largeXDDDDDDDDDD.csv', 'w') as f1:
        writer = csv.writer(f1, delimiter='\t', lineterminator='\n',)
        #df = pd.read_csv("C:\\Users\\Lukasz\\Desktop\\PROJEKTY PYTHON\\W TRAKCIE\\large.csv")
        #url = df['LINKS'][1]
        url = 'https://paypalshop.x.yupoo.com/albums/81513820?uid=1'
        response = requests.get(url)
        data = response.text
        soup = BeautifulSoup(data, 'lxml')
        szukaj = soup.find_all('div', {'class': "showalbum__children image__main"})
        for XD in szukaj:
            q = soup.find_all("data-src")
            print(q)
            #q= soup.find("img", {"class": "autocover image__img image__portrait"})
            #q=(tag.get('data-src'))

test_list()
HTML:
<div class="showalbum__children image__main" data-id="30842210">
<div class="image__imagewrap" data-type="photo">
<img alt="" class="autocover image__img image__portrait" data-album-id="83047567" data-frame="1" data-height="1080" data-origin-src="//photo.yupoo.com/ven-way/aac32ed1/2d2ed235.jpg" data-path="/ven-way/aac32ed1/2d2ed235.jpg" data-src="//photo.yupoo.com/ven-way/aac32ed1/big.jpg" data-type="photo" data-videoformats="" data-width="1080" src="//photo.yupoo.com/ven-way/aac32ed1/small.jpg"/>
<div class="image__clickhandle" data-photoid="30842210" style="width: 1080px; padding-bottom: 100.00%" title="点击查看详情">
</div>
Use a class selector for one of the children of the elements you are currently matching, so you are selecting at the right level. I use select and dictionary accessor notation to retrieve the attribute. find_all("data-src") cannot work, because find_all matches tag names, not attribute names.
import requests
from bs4 import BeautifulSoup

def test_list():
    url = 'https://paypalshop.x.yupoo.com/albums/81513820?uid=1'
    response = requests.get(url)
    data = response.content
    soup = BeautifulSoup(data, 'lxml')
    szukaj = soup.select('.image__portrait')
    for x in szukaj:
        q = x['data-src']
        print(q)

test_list()
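To also save the links to a CSV file, as the question asks, you could collect them and write one per row; a minimal sketch, assuming the same selector (the output filename is just an example):

import csv
import requests
from bs4 import BeautifulSoup

url = 'https://paypalshop.x.yupoo.com/albums/81513820?uid=1'
soup = BeautifulSoup(requests.get(url).content, 'lxml')

links = [img['data-src'] for img in soup.select('.image__portrait')]

with open('data_src_links.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['data-src'])  # header row
    for link in links:
        writer.writerow([link])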
I'm trying to extract specific classes from multiple URLs. The tags and classes stay the same, but I need my Python program to scrape them all as I input the links.
Here's a sample of my work:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip

url = input('insert URL here: ')

#scrape elements
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

#print titles only
h1 = soup.find("h1", class_="class-headline")
print(h1.get_text())
This works for individual URLs but not for a batch. Thanks for helping me. I learned a lot from this community.
Have a list of URLs and iterate through it:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip

urls = ['www.website1.com', 'www.website2.com', 'www.website3.com', .....]

#scrape elements
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    #print titles only
    h1 = soup.find("h1", class_="class-headline")
    print(h1.get_text())
If you are going to prompt the user for input for each site, it can be done this way:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip

#scrape elements
msg = 'Enter URL, to exit type q and hit enter: '
url = input(msg)
while url != 'q':
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    #print titles only
    h1 = soup.find("h1", class_="class-headline")
    print(h1.get_text())
    url = input(msg)  # reassign here, otherwise the loop can never exit
If you want to scrape links in batches, specify a batch size and iterate over it:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip

batch_size = 5
urllist = ["url1", "url2", "url3", .....]
url_chunks = [urllist[x:x + batch_size] for x in range(0, len(urllist), batch_size)]

def scrape_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    h1 = soup.find("h1", class_="class-headline")
    return h1.get_text()

def scrape_batch(url_chunk):
    chunk_resp = []
    for url in url_chunk:
        chunk_resp.append(scrape_url(url))
    return chunk_resp

for url_chunk in url_chunks:
    print(scrape_batch(url_chunk))
I have a problem cutting down the URL that I get as a result from BeautifulSoup. I've used this code to retrieve it:
import urllib2
from bs4 import BeautifulSoup

url = 'http://192.168.0.184:88/cgi-bin/CGIProxy.fcgi?cmd=snapPicture&usr=USER&pwd=PASS'
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, "html5lib")
imgs = soup.findAll("img")
print imgs
print imgs[1:]
As the result of print imgs I get [<img src="../snapPic/Snap_20160401-110642.jpg"/>]
I want to cut the unwanted characters from this string, so I tried, for example, print imgs[1:], but as the result I get []
Any tips or solutions?
I want to rebuild the imgs string into the correct image URL:
imgs string = <img src="../snapPic/Snap_20160401-110642.jpg"/>
wanted result = http://192.168.0.184:88/snapPic/Snap_20160401-110642.jpg
Try this. imgs is a list of tags, not a string, so imgs[1:] drops list elements (hence the empty []); access the tag's src attribute instead:
import urllib2
from bs4 import BeautifulSoup

url = 'http://192.168.0.184:88/cgi-bin/CGIProxy.fcgi?cmd=snapPicture&usr=USER&pwd=PASS'
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, "html5lib")
imgs = soup.findAll("img")
print imgs
for img in imgs:
    print img["src"].replace("..", "http://192.168.0.184:88")