Web scraping coordinates of a polygon with Python and BeautifulSoup

I'm trying to scrape information from this webpage and many similar ones: https://knowyourcity.info/settlement/1846/5119249
When viewing the page source, the coordinates for the polygon at the top of the page are available, but they are not visible when inspecting the polygon element. Would anyone know how to scrape these coordinates into a column of a DataFrame using the BeautifulSoup package in Python?
This is the code I used to access the website:
from requests import get
url = 'http://knowyourcity.info/settlement/1846/5119249'
response = get(url)
print(response.text[:500])
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, "html.parser")
type(html_soup)

It looks like the map is driven by the variable settlement. Therefore, one option is to loop through all the script tags and search for var settlement. Once you've found the variable, use a simple find plus string slicing to extract the variable's data, parse it as JSON, and then return the boundaries.
The example is for illustration purposes; you'll most likely want to refactor the code:
from requests import get
from bs4 import BeautifulSoup
import json

def getHtml():
    url = 'http://knowyourcity.info/settlement/1846/5119249'
    response = get(url)
    return response.text

def extractBoundaries(html):
    html_soup = BeautifulSoup(html, "html.parser")
    scripts = html_soup.find_all('script')
    for script in scripts:
        startFind = "var settlement = "
        endFind = "};"
        if script.contents and startFind in script.contents[0]:
            scriptText = script.contents[0]
            # Slice out the JSON object assigned to `settlement`
            # (keep the closing brace, drop the trailing semicolon)
            startIndex = scriptText.find(startFind) + len(startFind)
            endIndex = scriptText.find(endFind) + len(endFind) - 1
            settlementData = scriptText[startIndex:endIndex]
            jsonData = json.loads(settlementData)
            return jsonData['verification/A0_Boundary']

html = getHtml()
results = extractBoundaries(html)
print(results)
Output:
5.599769999885609 -0.224459999729163 0 0;5.599920830581937 -0.2235293057328249 0 0;5.600343984087658 -0.2220772405721618 0 0;5.600582171330188 -0.2212706242398781 0 0;5.600757735181389 -0.2203650797845285 0 0;5.600943331869303 -0.2195227513738018 0 0;5.601229999764712 -0.2178069995933356 0 0;5.601684627743396 -0.2160719483616731 0 0;5.602178000314495 -0.215115999603654 0 0;5.60277082980997 -0.213977987593978 0 0;5.60322584449716 -0.2131045282513355 0 0;5.603939996133988 -0.2117290691411995 0 0;5.604261867990886 -0.2111080629866819 0 0;5.604746000027944 -0.210174000129939 0 0;5.605512212518647 -0.208745954062465 0 0;5.605957084651777 -0.2079168151088879 0 0;5.60642700020594 -0.2070410004417909 0 0;5.606837000227415 -0.2063009995914058 0 0;5.607503034537444 -0.2072989224072899 0 0;5.608332999968013 -0.2085879998362543 0 0;5.608940827457275 -0.2094694811315776 0 0;5.609384837140567 -0.2101133921192968 0 0;5.609949999892649 -0.210933000057878 0 0;5.610520744736618 -0.2114266172445696 0 0;5.61105999981919 -0.2118930002616821 0 0;5.612419000436546 -0.2126160003281257 0 0;5.613144659798252 -0.2126897915006225 0 0;5.614907000058054 -0.2128690003040674 0 0;5.615398000217567 -0.2144450001366067 0 0;5.615173904452149 -0.2159211302559356 0 0;5.614935501372315 -0.2174915048290131 0 0;5.61470415976919 -0.2190153628686744 0 0;5.614495076386731 -0.2203926071330784 0 0;5.61425499966856 -0.2219740001999071 0 0;5.613865981729703 -0.2233052558328268 0 0;5.613273865396593 -0.2253315354219581 0 0;5.612689000297166 -0.227333000017893 0 0;5.611838309990048 -0.2274067552175438 0 0;5.611219650166788 -0.2272163984180224 0 0;5.610458222968646 -0.2271212195685735 0 0;5.609547010985807 -0.2272079061199293 0 0;5.608730734136145 -0.2266937097468826 0 0;5.607481517358167 -0.2262178181977106 0 0;5.605377060602905 -0.2259990644052436 0 0;5.603420000032998 -0.2258499999774699 0 0;5.602499999875136 -0.2257000002547329 0 0;5.601491149397077 -0.225320574484897 0 0;5.599769999885609 -0.224459999729163 0 0
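Since the question asks for the coordinates in a DataFrame column: the boundary comes back as one semicolon-separated string of "lat lon 0 0" points. A minimal sketch of parsing it (assuming the format stays consistent across settlements and only latitude/longitude are needed):

import pandas as pd

def boundaryToDataframe(boundary):
    # Each point looks like "5.5997... -0.2244... 0 0"; keep the first two fields
    points = [p.split()[:2] for p in boundary.split(';')]
    df = pd.DataFrame(points, columns=['lat', 'lon']).astype(float)
    # Optionally collapse the pair into a single column, if one column is required
    df['coordinates'] = list(zip(df['lat'], df['lon']))
    return df

df = boundaryToDataframe(results)
print(df.head())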

Related

Scraping PDF links into a dataframe from a webpage using BeautifulSoup

I want to extract all the PDF links that take us directly to the pages from where we can download the PDFs, and I want to store these links in a DataFrame.
import requests
from bs4 import BeautifulSoup

url = "https://www.volvogroup.com/en/news-and-media/press-releases.html"
source = requests.get(url)
soup = BeautifulSoup(source.text, "html.parser")
news_check = soup.find_all("a", class_="articlelist__contentDownloadItem")
for i in news_check:
    print(i)
    break
data = set()
for i in soup.find_all('a'):
    for j in i.find_all('href'):
        pdf_link = "https://www.volvogroup.com" + j.get('.pdf')
        data.add(j)
        print(pdf_link)
You can try the code below to get the PDF links:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

url = "https://www.volvogroup.com/en/news-and-media/press-releases.html"
source = requests.get(url)
soup = bs(source.text, "html.parser")
news_check = soup.find_all("a", class_="articlelist__contentDownloadItem")
data = set()
for i in news_check:
    pdf_link = "https://www.volvogroup.com" + i['href']
    data.add(pdf_link)
    # for j in i.find_all('href'):
    #     pdf_link = + j.get('.pdf')
    #     data.add(j)
    #     print(pdf_link)
df = pd.DataFrame(data)
print(df)
Output:
0 https://www.volvogroup.com/content/dam/volvo-g...
1 https://www.volvogroup.com/content/dam/volvo-g...
2 https://www.volvogroup.com/content/dam/volvo-g...
3 https://www.volvogroup.com/content/dam/volvo-g...
4 https://www.volvogroup.com/content/dam/volvo-g...
5 https://www.volvogroup.com/content/dam/volvo-g...
6 https://www.volvogroup.com/content/dam/volvo-g...
7 https://www.volvogroup.com/content/dam/volvo-g...
8 https://www.volvogroup.com/content/dam/volvo-g...
9 https://www.volvogroup.com/content/dam/volvo-g...
10 https://www.volvogroup.com/content/dam/volvo-g...
11 https://www.volvogroup.com/content/dam/volvo-g...
12 https://www.volvogroup.com/content/dam/volvo-g...
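If you also want to fetch the files themselves, a minimal sketch (assuming the links point directly at downloadable PDFs; naming each file after the last URL segment is just an illustrative choice):

import requests
import pandas as pd

df = pd.DataFrame(sorted(data), columns=["pdf_link"])  # name the column for readability
for link in df["pdf_link"]:
    filename = link.rsplit("/", 1)[-1]  # illustrative naming: last URL segment
    response = requests.get(link)
    with open(filename, "wb") as f:
        f.write(response.content)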

How to grab iframe data range from a page in Python and BeautifulSoup

I am trying to grab the iframe data of a page. I found some resources that work, but I can't get the data I want.
import requests
from bs4 import BeautifulSoup

s = requests.Session()
r = s.get("https://bscscan.com/token/0xe56842ed550ff2794f010738554db45e60730371#balances")
soup = BeautifulSoup(r.content, "html.parser")
iframe_src = soup.select_one("#tokeholdersiframe").attrs["src"]
r = s.get(f"https:{iframe_src}")
soup = BeautifulSoup(r.content, "html.parser")
for row in rowsblockdetails[1:]:
    rank = row.find_all('td')[0].text[0:].strip()
    address = row.find_all('td')[1].text[0:].strip()
    amount = row.find_all('td')[2].text[0:].strip()
    percentage = row.find_all('td')[3].text[0:]
    print(" {:<3} {:<25} {:>15} {:>10} ".format(rank, address, amount, percentage))
Current output (it raises an error):
Traceback (most recent call last):
r = s.get(f"https:{iframe_src}")
return self.request('GET', url, **kwargs)
raise InvalidURL("Invalid URL %r: No host supplied" % url)
Wanted Output:
1 UniCrypt: Token Vesting 150,000,000 15.0451% Contract
2 PancakeSwap V2: BIN 17 77,320,752.850881264572940617 8.1141% Contract
3 0xa36b9dc17e421d86ddf8e490dafa87344e76125b 49,463,154.04616156547917712 4.9612%
4 0xbbda05ea467ad348212dade5c38c11910c14e83e 48,704,064.094074959661726945 4.8769%
5 0xcfdb8569fb546a010bb22b5057679c4053d4a231 11,493,129.656390775184191781 1.1528%
You can get the iframe HTML directly; all you need is the token address:
import requests
from bs4 import BeautifulSoup

s = requests.Session()
iframe_src = "https://bscscan.com/token/generic-tokenholders2?m=normal&a=0xe56842ed550ff2794f010738554db45e60730371"
r = s.get(iframe_src)
soup = BeautifulSoup(r.content, "html.parser")
for row in soup.select("tr:has(td)"):
    rank = row.find_all("td")[0].text.strip()
    address = row.find_all("td")[1].text.strip()
    amount = row.find_all("td")[2].text.strip()
    percentage = row.find_all("td")[3].text
    print(" {:<3} {:<45} {:^35} {:>10} ".format(rank, address, amount, percentage))
Prints:
1 UniCrypt: Token Vesting 150,000,000 0.0000%
2 PancakeSwap V2: BIN 17 76,926,258.749406306830460162 0.0000%
3 0xa36b9dc17e421d86ddf8e490dafa87344e76125b 49,463,154.04616156547917712 0.0000%
4 0xbbda05ea467ad348212dade5c38c11910c14e83e 48,770,394.148172297052962199 0.0000%
5 0xcfdb8569fb546a010bb22b5057679c4053d4a231 11,493,129.656390775184191781 0.0000%
6 0xe56842ed550ff2794f010738554db45e60730371 10,236,437.028812018664646028 0.0000%
7 0x1a1db1616854b4fe4723925f7129188483500eb6 10,000,000 0.0000%
8 0xa8b398896d67cea6d26fc140e056f745261c4b00 9,024,167.759368544603338806 0.0000%
9 0x0d9b1e53cbb251572d982d9f96520e8d40d22bb0 7,200,000 0.0000%
10 0x934f895383a6eb7d8a8cfd6c894f7fb57ad5f2b1 7,078,575.994558878120028183 0.0000%
11 0x739621ea040cf397a169047df2935458c6502d63 7,000,000 0.0000%
12 0xc7129c10f8056986716effffbbe0f1e9c80622d8 5,307,909.106163478741030017 0.0000%
13 0x43e5959343cd9154080c235c16fbb4bbd7f83e70 5,215,489.453108218359291854 0.0000%
14 0xa7a9544d86066bf583be602195536918497b1fff 5,000,000 0.0000%
15 0x81da471feb4a45438053dc05e709be056ec26c39 4,900,400 0.0000%
16 0xc68446c2a2e03e932774c3353629b9979b380c72 4,846,096.854970087386140103 0.0000%
17 0xb6957013a430e4cf509a4c51002073c1b24356e2 4,641,857.889152974553322217 0.0000%
18 0x4b0005c7bba3e10820b5b3a2863821e00701b383 4,570,260.896212362994501438 0.0000%
19 0xe02752824b6b11e027080e75f692bd22b3dc7091 4,388,894.363703154394892711 0.0000%
...and so on.
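If you'd rather derive the iframe URL from the page instead of hardcoding it: the original "No host supplied" error suggests the iframe's src can't just be prefixed with "https:". As a sketch, urllib.parse.urljoin resolves relative, protocol-relative, and absolute src values alike:

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

s = requests.Session()
r = s.get("https://bscscan.com/token/0xe56842ed550ff2794f010738554db45e60730371#balances")
soup = BeautifulSoup(r.content, "html.parser")
iframe_src = soup.select_one("#tokeholdersiframe")["src"]
# urljoin handles "/path", "//host/path", and full URLs uniformly
r = s.get(urljoin(r.url, iframe_src))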

BeautifulSoup: trying to get text from wrapped divs, but empty or "None" is returned

Here is a picture (sorry) of the HTML that I am trying to parse:
I am using this line:
home_stats = soup.select_one('div', class_='statText:nth-child(1)').text
Thinking that I'd get the 1st child of the class statText and the outcome would be 53%.
But it doesn't: I get "Loading..." and none of the data I was trying to use and display.
The full code I have so far:
soup = BeautifulSoup(source, 'lxml')
home_team = soup.find('div', class_='tname-home').a.text
away_team = soup.find('div', class_='tname-away').a.text
home_score = soup.select_one('.current-result .scoreboard:nth-child(1)').text
away_score = soup.select_one('.current-result .scoreboard:nth-child(2)').text
print("The home team is " + home_team, "and they scored " + home_score)
print()
print("The away team is " + away_team, "and they scored " + away_score)
home_stats = soup.select_one('div', class_='statText:nth-child(1)').text
print(home_stats)
This currently does print the home and away teams and the number of goals they scored, but I can't seem to get any of the statistical content from this site.
My output plan is to have:
[home_team] had 53% ball possession and [away_team] had 47% ball possession
However, I would like to remove the "%" symbols from the parse (but that's not essential). My plan is to use these numbers for more stats later on, so the % symbol gets in the way.
Apologies for the noob question - this is the absolute beginning of my Pythonic journey. I have scoured the internet and StackOverflow and just can not find this situation - I also possibly don't know exactly what I am looking for either.
Thanks kindly for your help! May your answer be the one I pick as "correct" ;)
Assuming that this is the website you're trying to scrape, here is the complete code to scrape all the stats:
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome('chromedriver.exe')
driver.get('https://www.scoreboard.com/en/match/SO3Fg7NR/#match-statistics;0')
pg = driver.page_source  # Gets the source code of the page
driver.close()
soup = BeautifulSoup(pg, 'html.parser')  # Creates a soup object
statrows = soup.find_all('div', class_="statTextGroup")  # These div tags contain the stats

# Scrapes the team names
teams = soup.find_all('a', class_="participant-imglink")
teamslst = []
for x in teams:
    team = x.text.strip()
    if team != "":
        teamslst.append(team)

stats_dict = {}
count = 0
for x in statrows:
    txt = x.text
    final_txt = ""
    stat = ""
    alphabet = False
    percentage = False
    # Extracts the numbers from the text
    for c in txt:
        if c in '0123456789':
            final_txt += c
        else:
            if alphabet == False:
                final_txt += "-"
                alphabet = True
            if c != "%":
                stat += c
            else:
                percentage = True
    values = final_txt.split('-')
    # Appends the values to the dictionary
    for x in values:
        if stat in stats_dict.keys():
            if percentage == True:
                stats_dict[stat].append(x + "%")
            else:
                stats_dict[stat].append(int(x))
        else:
            if percentage == True:
                stats_dict[stat] = [x + "%"]
            else:
                stats_dict[stat] = [int(x)]
    count += 1
    if count == 15:
        break

index = [teamslst[0], teamslst[1]]
# Creates a pandas DataFrame out of the dictionary
df = pd.DataFrame(stats_dict, index=index).T
print(df)
Output:
Burnley Southampton
Ball Possession 53% 47%
Goal Attempts 10 5
Shots on Goal 2 1
Shots off Goal 4 2
Blocked Shots 4 2
Free Kicks 11 10
Corner Kicks 8 2
Offsides 2 1
Goalkeeper Saves 0 2
Fouls 8 10
Yellow Cards 1 0
Total Passes 522 480
Tackles 15 12
Attacks 142 105
Dangerous Attacks 44 29
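Since you mentioned wanting the % symbols removed for later numeric work, one hedged option (assuming the percentage rows keep the "NN%" string format shown above):

# Turn the "53%" / "47%" strings in the Ball Possession row into plain ints
df.loc["Ball Possession"] = df.loc["Ball Possession"].str.rstrip("%").astype(int)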
Hope that this helps!
P.S.: I actually wrote this code for a different question but didn't post it, as an answer was already there. I didn't know it would come in handy now! Anyway, I hope my answer does what you need.

Is there any way to get the cookies and cache of a visited website from chrome to beautifulsoup in Python?

I want to scrape weather data from a certain website. The default page layout gives a maximum of 40 results, but changing the layout to a simple list gives 100; the layout is reset to the default each time, which is difficult to handle with Selenium. Is there any way to get the cookies saved in Chrome so they can be used with Beautiful Soup?
import requests
from bs4 import BeautifulSoup
import browser_cookie3

cj = browser_cookie3.load()
s = requests.Session()
url = "https:/something.org/titles/2"
i = 1
print(cj)
for c in cj:
    if 'mangadex' in str(c):
        s.cookies.set_cookie(c)
r = s.get(url)
soup = BeautifulSoup(r.content, 'lxml')
for anime in soup.find_all('div', {'class': 'manga-entry col-lg-6 border-bottom pl-0 my-1'}):
    det = anime.find('a', {"class": "ml-1 manga_title text-truncate"})
    anime_name = det.text
    anime_link = det['href']
    stars = anime.select("span")[3].text
    print(anime_name, anime_link, stars, i)
    i = i + 1
Try:
import browser_cookie3
import requests

cj = browser_cookie3.load()
s = requests.Session()
for c in cj:
    if 'sitename' in str(c):
        s.cookies.set_cookie(c)
r = s.get(the_site)
This code loads the browser's cookies into a requests Session. Simply change sitename to the site you want the cookies from.
Your new code:
import requests
from bs4 import BeautifulSoup
import browser_cookie3

cj = browser_cookie3.load()
s = requests.Session()
url = "https://something.org/titles/2"
i = 1
print(cj)
for c in cj:
    if 'mangadex' in str(c):
        s.cookies.set_cookie(c)
r = s.get(url)
soup = BeautifulSoup(r.content, 'lxml')
for anime in soup.find_all('div', {'class': 'manga-entry row m-0 border-bottom'}):
    det = anime.find('a', {"class": "ml-1 manga_title text-truncate"})
    anime_name = det.text
    anime_link = det['href']
    stars = anime.select("span")[3].text
    print(anime_name, anime_link, stars, i)
    i = i + 1
prints:
-Hitogatana- /title/540/hitogatana 4 1
-PIQUANT- /title/44134/piquant 5 2
-Rain- /title/37103/rain 4 3
-SINS- /title/1098/sins 4
:radical /title/46819/radical 1 5
:REverSAL /title/3877/reversal 3 6
... /title/52206/ 7
...Curtain. ~Sensei to Kiyoraka ni Dousei~ /title/7829/curtain-sensei-to-kiyoraka-ni-dousei 8
...Junai no Seinen /title/28947/junai-no-seinen 9
...no Onna /title/10162/no-onna 2 10
...Seishunchuu! /title/19186/seishunchuu 11
...Virgin Love /title/28945/virgin-love 12
.flow - Untitled (Doujinshi) /title/27292/flow-untitled-doujinshi 2 13
.gohan /title/50410/gohan 14
.hack//4koma + Gag Senshuken /title/7750/hack-4koma-gag-senshuken 24 15
.hack//Alcor - Hagun no Jokyoku /title/24375/hack-alcor-hagun-no-jokyoku 16
.hack//G.U.+ /title/7757/hack-g-u 1 17
.hack//GnU /title/7758/hack-gnu 18
.hack//Link - Tasogare no Kishidan /title/24374/hack-link-tasogare-no-kishidan 1 19
.hack//Tasogare no Udewa Densetsu /title/5817/hack-tasogare-no-udewa-densetsu 20
.hack//XXXX /title/7759/hack-xxxx 21
.traeH /title/9789/traeh 22
(G) Edition /title/886/g-edition 1 23
(Not) a Househusband /title/22832/not-a-househusband 6 24
(R)estauraNTR /title/37551/r-estaurantr 14 25
[ rain ] 1st Story /title/25587/rain-1st-story 3 26
[another] Xak /title/24881/another-xak 27
[es] ~Eternal Sisters~ /title/4879/es-eternal-sisters 1 28
and so on to 100...
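As a side note, browser_cookie3 can also load a single browser's cookie store filtered by domain, which avoids the manual loop. A sketch (the domain here reuses the question's placeholder):

import browser_cookie3
import requests

# Load only Chrome's cookies whose domain matches the given name
cj = browser_cookie3.chrome(domain_name='something.org')
r = requests.get("https://something.org/titles/2", cookies=cj)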

Issues dividing an HTML section by 'tr' tags using Selenium in Python

I tried to collect the data from this page (http://www.bobaedream.co.kr/mycar/popup/mycarChart_4.php?zone=C&cno=639137&tbl=cyber) using Selenium with Python 3.6. What I tried to do is divide the section into two parts and collect the data from each.
The parts are like below:
The items in the two parts are made up of 39 'tr' tags. I selected the 0th to 14th 'tr' tags for the first part and the 15th through the last 'tr' tags for the second part, but the first part already prints all the way down to the last 'tr' tag. I don't understand why this happens.
Below is my code:
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import urlparse
from urllib.parse import quote
from selenium import webdriver
import re
import time

popup_inspection = "http://www.bobaedream.co.kr/mycar/popup/mycarChart_4.php?zone=C&cno=639137&tbl=cyber"

driver = webdriver.PhantomJS()
driver.set_window_size(500, 300)
driver.get(popup_inspection)
soup_inspection = BeautifulSoup(driver.page_source, "html.parser")

count = 0       # for loop count
count_insp = 0  # leaks and malfunctions (누유 및 오작동)
count_in = 0    # frame (골격)
count_out = 0   # exterior (외관)

insp_tables = soup_inspection.find_all('table', class_=True)
for insp_table in insp_tables[4].find_all('tr'):
    labels = insp_table.find_all('td', class_="center")
    for label in labels[:15]:
        if label.find("input", type="checkbox", checked=True):
            count_out += 1
            print(label.text)
        else:
            print(label.text)
    print("외관 이상 수: ", count_out)  # number of exterior issues
    for label in labels[16:]:
        if label.find("input", type="checkbox", checked=True):
            count_in += 1
            print(label.text)
        else:
            print(label.text)
    print("골격 이상 수: ", count_in)  # number of frame issues
The result I would like to have is like below:
<Upper Part>
1 후드 0 0
2 프론트 휀더(좌) 0 0
......
8 트렁크 리드 1 0
Total : 1 0
<Lower Part>
1 프론트 패널
2 크로스 멤버
....
22 리어 패널 1 0
23 트렁크 플로어 0 0
Total : 1 0
Please help me to work this out.
Thanks.
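For what it's worth, the slices in the code above are applied to labels, i.e. the 'td' cells within a single row, not to the 39 'tr' rows, so every pass through the outer loop prints that row's cells and the "first part" ends up covering every row. A minimal sketch of slicing the row list itself, reusing the variables defined above and keeping the question's table index and 15-row split as assumptions:

rows = insp_tables[4].find_all('tr')
upper, lower = rows[:15], rows[15:]

for row in upper:
    for label in row.find_all('td', class_="center"):
        if label.find("input", type="checkbox", checked=True):
            count_out += 1
        print(label.text)
print("외관 이상 수: ", count_out)  # number of exterior issues

for row in lower:
    for label in row.find_all('td', class_="center"):
        if label.find("input", type="checkbox", checked=True):
            count_in += 1
        print(label.text)
print("골격 이상 수: ", count_in)  # number of frame issues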
