How to bypass googletagmanager while scraping - Python

Since the site added the googletagmanager script, I can't get what I need. With the code below I was scraping links, but now I'm getting "www.googletagmanager.com" in every row, and I don't know how to handle that. Thank you.
[HTML][1]
[How the CSV file looks now][2]
import csv
import requests
import pandas as pd
from bs4 import BeautifulSoup

data_list = ["LINKI", "GOWNO", "JAJCO"]
with open('innovators.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(data_list)
    for i in range(0, 50):
        #df = pd.read_csv("C:\\Users\\Lukasz\\Desktop\\PROJEKTY PYTHON\\W TRAKCIE\\bf3_strona2.csv")
        #url = "https://bf3.space/" + df['LINKS'][i]
        url = 'https://bf3.space/a-Byu6am3P'
        response = requests.get(url)
        data = response.text
        soup = BeautifulSoup(data, 'lxml')
        rows = soup.find('iframe')
        q = rows.get('src')
        writer.writerow([q])
[1]: https://i.stack.imgur.com/Ogq0N.png
[2]: https://i.stack.imgur.com/3JYqc.png

You can use soup.find() with a lambda.
For example:
import requests
from bs4 import BeautifulSoup

url = 'https://bf3.space/a-Byu6am3P'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# the "s and" check guards against <iframe> tags that have no src attribute
print(soup.find('iframe', src=lambda s: s and 'googletagmanager.com' not in s))
This prints the first non-googletagmanager <iframe> tag:
<iframe align="center" frameborder="0" height="1500" src="https://ven-way.x.yupoo.com/albums/83591895?uid=1" style="margin: 10px 0;padding: 0px 0px; border:none" width="100%"></iframe>
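To fold this back into the CSV loop from the question, here is a minimal sketch (the links list is a stand-in for the column read from your own CSV file):

import csv
import requests
from bs4 import BeautifulSoup

links = ['https://bf3.space/a-Byu6am3P']  # stand-in for the links read from your CSV

with open('innovators.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['LINKI'])
    for url in links:
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        # skip googletagmanager iframes; the "s and" guard also skips iframes with no src
        iframe = soup.find('iframe', src=lambda s: s and 'googletagmanager.com' not in s)
        if iframe:
            writer.writerow([iframe['src']])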

Related

How can I save scraped data from a soup object into CSV?

I am looking to save only the scraped data into a CSV file.
This is the scraped data and code:
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-
SkillsNetwork/labs/datasets/Programming_Languages.html"
from bs4 import BeautifulSoup
import requests
data = requests.get(url).text
soup = BeautifulSoup(data,"html5lib")
table = soup.find('table')
for row in table.find_all('tr'):
cols = row.find_all('td')
programing_language = cols[1].getText()
salary = cols[3].getText()
print("{}--->{}".format(programing_language,salary))
Here is the solution.
import pandas as pd
from bs4 import BeautifulSoup
import requests

data = []
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/Programming_Languages.html"

# use a separate name for the page text so the data list is not overwritten
html = requests.get(url).text
soup = BeautifulSoup(html, "html5lib")
table = soup.find('table')
for row in table.find_all('tr'):
    cols = row.find_all('td')
    programing_language = cols[1].getText()
    salary = cols[3].getText()
    data.append([programing_language, salary])
    #print("{}--->{}".format(programing_language, salary))

cols = ['programing_language', 'salary']
df = pd.DataFrame(data, columns=cols)
df.to_csv("data.csv", index=False)
For a lightweight solution you can just use csv. Ignore the headers row by using tr:nth-child(n+2); this nth-child range selector selects from the second tr onward. Then, within a loop over the subsequent rows, select the second and fourth columns as follows:
from bs4 import BeautifulSoup as bs
import requests, csv

response = requests.get('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/Programming_Languages.html',
                        headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(response.content, 'lxml')

with open("programming.csv", "w", encoding="utf-8-sig", newline='') as f:
    w = csv.writer(f, delimiter=",", quoting=csv.QUOTE_MINIMAL)
    w.writerow(["Language", "Average Annual Salary"])
    for item in soup.select('tr:nth-child(n+2)'):
        w.writerow([item.select_one('td:nth-child(2)').text,
                    item.select_one('td:nth-child(4)').text])
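As an aside: since the page is a single plain HTML table, pandas can read and save it directly. A short sketch, assuming the table parses cleanly (read_html needs lxml or html5lib installed):

import pandas as pd

url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/Programming_Languages.html"

# read_html returns one DataFrame per <table> on the page
df = pd.read_html(url)[0]
df.to_csv("data.csv", index=False)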

Using Beautiful Soup on multiple URLs

I have searched through a lot of similar questions, but I'm unable to resolve the issue with the code below.
I am trying to scrape the same information from 2 separate URLs.
There is no issue when I scrape 1 URL (first code). When I then attempt to loop through multiple URLs (second code), it throws this error:
ResultSet object has no attribute 'find_all'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
Should the line where the error is returned (highlighted below) not be included within the for loop? (I have tried this unsuccessfully.)
Could someone please explain why this is not working (my guess would be that the structure is wrong in some way, but I've been unable to adjust it correctly), or whether this is in fact not the optimal method at all?
First code:
import csv
import pandas as pd
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq
import numpy as np
import re

url = "https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-royal-challengers-bangalore-42nd-match-734013/full-scorecard"
url_contents = ureq(url)  # opening the URL
soup = soup(url_contents, "html.parser")  # parse the HTML

batsmen = soup.find_all("table", {"class": ["table batsman"]})
bowlers = soup.find_all("table", {"class": ["table bowler"]})

for batsman in batsmen[0]:
    with open('testcsv3.csv', 'a', newline='') as csvfile:
        f = csv.writer(csvfile)
        print(batsmen)
        for x in batsman:
            rows = batsman.find_all('tr')[:-2]  # find all tr tags (rows)
            for tr in rows:
                data = []
                cols = tr.find_all('td')  # find all td tags (columns)
                for td in cols:
                    data.append(td.text.strip())
                f.writerow(data)
                print(data)

for bowler in bowlers[1]:
    with open('testcsv3.csv', 'a', newline='') as csvfile:
        f = csv.writer(csvfile)
        print(bowlers)
        for x in bowler:
            rows = bowler.find_all('tr')  # find all tr tags (rows)
            for tr in rows:
                data = []
                cols = tr.find_all('td')  # find all td tags (columns)
                for td in cols:
                    data.append(td.text.strip())
                f.writerow(data)
                print(data)
Second code:
import csv  # to do operations on CSV
import pandas as pd  # file operations
from bs4 import BeautifulSoup as soup  # scraping tool
from urllib.request import urlopen as ureq  # for requesting data from a link
import numpy as np
import re

urls = ["https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-royal-challengers-bangalore-42nd-match-734013/full-scorecard",
        "https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-kolkata-knight-riders-21st-match-733971/full-scorecard"]

for url in urls:
    url_contents = ureq(url)  # opening the URL
    soup = soup(url_contents, "html.parser")  # parse the HTML
    batsmen = soup.find_all("table", {"class": ["table batsman"]})  # <-- error here
    bowlers = soup.find_all("table", {"class": ["table bowler"]})

for batsman in batsmen[0]:
    with open('testcsv3.csv', 'a', newline='') as csvfile:
        f = csv.writer(csvfile)
        print(batsmen)
        for x in batsman:
            rows = batsman.find_all('tr')[:-2]  # find all tr tags (rows)
            for tr in rows:
                data = []
                cols = tr.find_all('td')  # find all td tags (columns)
                for td in cols:
                    data.append(td.text.strip())
                f.writerow(data)
                print(data)

for bowler in bowlers[1]:
    with open('testcsv3.csv', 'a', newline='') as csvfile:
        f = csv.writer(csvfile)
        print(bowlers)
        for x in bowler:
            rows = bowler.find_all('tr')  # find all tr tags (rows)
            for tr in rows:
                data = []
                cols = tr.find_all('td')  # find all td tags (columns)
                for td in cols:
                    data.append(td.text.strip())
                f.writerow(data)
                print(data)
Your problem is that you use the same name, soup, for both the class/function soup(...) and the result soup = ..., and you run it in a loop:

from bs4 import BeautifulSoup as soup

for url in urls:
    soup = soup(...)

In the first iteration everything works correctly, but the class/function soup() is replaced by the result soup = ..., so the next iteration tries to use the result soup as a class/function, and that causes the problem.
In the first code you run soup = soup() only once, so there is no problem.
If you use different names, i.e. BeautifulSoup instead of soup, then it will work:

from bs4 import BeautifulSoup

for url in urls:
    soup = BeautifulSoup(...)

BTW:
In the second code you have wrong indentation: you should run for batsman in ... and for bowler in ... inside for url in urls:, but you run them outside (after exiting the loop for url in urls:), and this will give you results only for the last url.
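Putting both fixes together, a minimal sketch of the corrected second code (simplified so the batsman and bowler tables are written the same way, and the file is opened once; this is not the asker's exact row filtering):

import csv
from bs4 import BeautifulSoup  # the class keeps its own name
from urllib.request import urlopen as ureq

urls = ["https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-royal-challengers-bangalore-42nd-match-734013/full-scorecard",
        "https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-kolkata-knight-riders-21st-match-733971/full-scorecard"]

with open('testcsv3.csv', 'a', newline='') as csvfile:
    f = csv.writer(csvfile)
    for url in urls:
        soup = BeautifulSoup(ureq(url), "html.parser")  # a fresh soup per URL
        batsmen = soup.find_all("table", {"class": ["table batsman"]})
        bowlers = soup.find_all("table", {"class": ["table bowler"]})
        # the table loops stay inside the url loop, so every url is written
        for table in batsmen + bowlers:
            for tr in table.find_all('tr'):
                data = [td.text.strip() for td in tr.find_all('td')]
                if data:
                    f.writerow(data)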
You can use the requests library and try this:

import requests as req
from bs4 import BeautifulSoup

urls = ["https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-royal-challengers-bangalore-42nd-match-734013/full-scorecard",
        "https://www.espncricinfo.com/series/pepsi-indian-premier-league-2014-695871/chennai-super-kings-vs-kolkata-knight-riders-21st-match-733971/full-scorecard"]

for url in urls:
    otp = req.get(url)
    if otp.ok:
        soup = BeautifulSoup(otp.text, 'lxml')
        batsmen = soup.find_all('table', {'class': 'table batsman'})
        bowlers = soup.find_all('table', {'class': 'table bowler'})
        for bat in batsmen:
            print(bat.find_all('td'))  # here you can use the find/find_all methods
        for bowl in bowlers:
            print(bowl.find_all('td'))  # here you can use the find/find_all methods

How to change names of scraped images with Python?

I need to download the images of every coin on the list on CoinGecko, so I wrote the following code:
import requests
from bs4 import BeautifulSoup
from os.path import basename

def getdata(url):
    r = requests.get(url)
    return r.text

htmldata = getdata("https://www.coingecko.com/en")
soup = BeautifulSoup(htmldata, 'html.parser')
for item1 in soup.select('.coin-icon img'):
    link = item1.get('data-src').replace('thumb', 'thumb_2x')
    with open(basename(link), "wb") as f:
        f.write(requests.get(link).content)
However, I need to save the images with their names being the same as the ticker of the coin of that list from CoinGecko (rename bitcoin.png?1547033579 to BTC.png, ethereum.png?1595348880 to ETH.png, and so forth). There are over 7000 images that need to be renamed, and many of them have quite unique names, so slicing does not work here.
What is the way to do it?
I was browsing the HTML and found that the tag you are looking at has an alt attribute with the ticker at the end of the string.
<div class="coin-icon mr-2 center flex-column">
<img class="" alt="bitcoin (BTC)" data-src="https://assets.coingecko.com/coins/images/1/thumb/bitcoin.png?1547033579" data-srcset="https://assets.coingecko.com/coins/images/1/thumb_2x/bitcoin.png?1547033579 2x" src="https://assets.coingecko.com/coins/images/1/thumb/bitcoin.png?1547033579" srcset="https://assets.coingecko.com/coins/images/1/thumb_2x/bitcoin.png?1547033579 2x">
</div>
So we can use that to get the correct name like so:
import requests
from bs4 import BeautifulSoup

def getdata(url):
    r = requests.get(url)
    return r.text

htmldata = getdata("https://www.coingecko.com/en")
soup = BeautifulSoup(htmldata, 'html.parser')
for item1 in soup.select('.coin-icon img'):
    link = item1.get('data-src').replace('thumb', 'thumb_2x')
    raw_name = item1.get('alt')  # e.g. "bitcoin (BTC)"
    name = raw_name[raw_name.find('(') + 1:-1]  # e.g. "BTC"
    with open(name + ".png", "wb") as f:  # save as e.g. BTC.png
        f.write(requests.get(link).content)
We are basically extracting the value between the parentheses using string slicing.
This is something you could do alternatively:
import requests
from bs4 import BeautifulSoup
from os.path import basename

url = "https://www.coingecko.com/en"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
for item1 in soup.select('td.coin-name[data-text]'):
    ticker_name = item1.select_one(".center > span").get_text(strip=True)
    image_link = item1.select_one(".coin-icon > img").get('data-src').replace('thumb', 'thumb_2x')
    with open(f"{basename(ticker_name)}.png", "wb") as f:  # save as e.g. BTC.png
        f.write(requests.get(image_link).content)
I believe you could achieve this very easily using string slicing:
import requests
from bs4 import BeautifulSoup
from os.path import basename

def getdata(url):
    r = requests.get(url)
    return r.text

htmldata = getdata("https://www.coingecko.com/en")
soup = BeautifulSoup(htmldata, 'html.parser')
for item1 in soup.select('.coin-icon img'):
    link = item1.get('data-src').replace('thumb', 'thumb_2x')
    with open(basename(link[:link.find('?')]), "wb") as f:
        f.write(requests.get(link).content)
I am slicing a section of the link string using [:] and looking for the question mark that marks the beginning of the query.
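A slightly more robust variant of the same idea is to let urllib.parse split off the query string instead of searching for the question mark by hand. A small sketch (same result for these links):

from os.path import basename
from urllib.parse import urlsplit

link = "https://assets.coingecko.com/coins/images/1/thumb_2x/bitcoin.png?1547033579"
# urlsplit separates the path from the query, so no manual find('?') is needed
print(basename(urlsplit(link).path))  # -> bitcoin.png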

Extracting each repeated tag from the same class with BeautifulSoup

I would like to extract all the 'data-src' values from this page and then save the results to CSV. There are several 'data-src' attributes on this page, all in the same class, and I don't know how to deal with that.
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from csv import writer

def test_list():
    with open('largeXDDDDDDDDDD.csv', 'w') as f1:
        writer = csv.writer(f1, delimiter='\t', lineterminator='\n')
        #df = pd.read_csv("C:\\Users\\Lukasz\\Desktop\\PROJEKTY PYTHON\\W TRAKCIE\\large.csv")
        #url = df['LINKS'][1]
        url = 'https://paypalshop.x.yupoo.com/albums/81513820?uid=1'
        response = requests.get(url)
        data = response.text
        soup = BeautifulSoup(data, 'lxml')
        szukaj = soup.find_all('div', {'class': "showalbum__children image__main"})
        for XD in szukaj:
            q = soup.find_all("data-src")
            print(q)
            #q = soup.find("img", {"class": "autocover image__img image__portrait"})
            #q = tag.get('data-src')

test_list()
HTML:
<div class="showalbum__children image__main" data-id="30842210">
<div class="image__imagewrap" data-type="photo">
<img alt="" class="autocover image__img image__portrait" data-album-id="83047567" data-frame="1" data-height="1080" data-origin-src="//photo.yupoo.com/ven-way/aac32ed1/2d2ed235.jpg" data-path="/ven-way/aac32ed1/2d2ed235.jpg" data-src="//photo.yupoo.com/ven-way/aac32ed1/big.jpg" data-type="photo" data-videoformats="" data-width="1080" src="//photo.yupoo.com/ven-way/aac32ed1/small.jpg"/>
<div class="image__clickhandle" data-photoid="30842210" style="width: 1080px; padding-bottom: 100.00%" title="点击查看详情">
</div>
Use a class selector for one of the children of the elements you are currently matching, so you are at the right level. I use select and dict accessor notation to retrieve the attribute. You cannot use find_all("data-src") as you have written it: find_all expects a tag name, not an attribute name.
import requests
from bs4 import BeautifulSoup
import csv

def test_list():
    #with open('largeXDDDDDDDDDD.csv', 'w') as f1:
    #    writer = csv.writer(f1, delimiter='\t', lineterminator='\n')
    url = 'https://paypalshop.x.yupoo.com/albums/81513820?uid=1'
    response = requests.get(url)
    data = response.content
    soup = BeautifulSoup(data, 'lxml')
    szukaj = soup.select('.image__portrait')
    for x in szukaj:
        q = x['data-src']
        print(q)

test_list()
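To also save the values to CSV, as the question asks, a minimal sketch building on the same selector (the output filename is just an example):

import csv
import requests
from bs4 import BeautifulSoup

url = 'https://paypalshop.x.yupoo.com/albums/81513820?uid=1'
soup = BeautifulSoup(requests.get(url).content, 'lxml')

with open('data_src.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['data-src'])
    for img in soup.select('.image__portrait'):
        writer.writerow([img['data-src']])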

How can I scrape multiple pages using Beautiful Soup?

How can I scrape multiple pages from a website? This code is only working for the first one:
import csv
import requests
from bs4 import BeautifulSoup
import datetime

filename = "azet_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + ".csv"
with open(filename, "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Descriere", "Pret", "Data"])
    r = requests.get("https://azetshop.ro/12-extensa?page=1")
    soup = BeautifulSoup(r.text, "html.parser")
    x = soup.find_all("div", "thumbnail")
    for thumbnail in x:
        descriere = thumbnail.find("h3").text.strip()
        pret = thumbnail.find("price").text.strip()
        writer.writerow([descriere, pret, datetime.datetime.now()])
For scraping multiple pages with BeautifulSoup, this is usually done with a while loop:
import csv
import requests
from bs4 import BeautifulSoup
import datetime

end_page_num = 50
filename = "azet_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + ".csv"
with open(filename, "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Descriere", "Pret", "Data"])
    i = 1
    while i <= end_page_num:
        r = requests.get("https://azetshop.ro/12-extensa?page={}".format(i))
        soup = BeautifulSoup(r.text, "html5lib")
        x = soup.find_all("div", {'class': 'thumbnail-container'})
        for thumbnail in x:
            descriere = thumbnail.find('h1', {"class": "h3 product-title"}).text.strip()
            pret = thumbnail.find('span', {"class": "price"}).text.strip()
            writer.writerow([descriere, pret, datetime.datetime.now()])
        i += 1
Here i increments by 1 each time the scraping of a page is completed.
Scraping continues until the end_page_num you have defined.
This code, which uses the class attribute with bs4, works fine too:
import csv
import requests
from bs4 import BeautifulSoup
import datetime

filename = "azet_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + ".csv"
with open(filename, "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Descriere", "Pret", "Data"])
    for i in range(1, 50):
        r = requests.get("https://azetshop.ro/12-extensa?page=" + str(i))
        soup = BeautifulSoup(r.text, "html.parser")
        array_price = soup.find_all('span', class_='price')
        array_desc = soup.find_all('h1', class_='h3 product-title', text=True)
        for iterator in range(0, len(array_price)):
            descriere = array_desc[iterator].text.strip()
            pret = array_price[iterator].text.strip()
            writer.writerow([descriere, pret, datetime.datetime.now()])
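If the number of pages is not known up front, a common refinement is to stop when a page returns no products instead of hard-coding 50 pages. A sketch along those lines (assuming an empty result list marks the end of the listing):

import csv
import datetime
import requests
from bs4 import BeautifulSoup

filename = "azet_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + ".csv"
with open(filename, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Descriere", "Pret", "Data"])
    page = 1
    while True:
        r = requests.get("https://azetshop.ro/12-extensa?page={}".format(page))
        soup = BeautifulSoup(r.text, "html.parser")
        thumbnails = soup.find_all("div", {"class": "thumbnail-container"})
        if not thumbnails:  # no products -> past the last page
            break
        for thumbnail in thumbnails:
            descriere = thumbnail.find("h1", {"class": "h3 product-title"}).text.strip()
            pret = thumbnail.find("span", {"class": "price"}).text.strip()
            writer.writerow([descriere, pret, datetime.datetime.now()])
        page += 1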
