Scraping PDF links into a DataFrame from a webpage using BeautifulSoup - Python

I want to extract all the PDF links that take us directly to the page from which the PDFs can be downloaded, and I want to store these links in a DataFrame.
url = "https://www.volvogroup.com/en/news-and-media/press-releases.html"
source = requests.get(url)
soup = BeautifulSoup(source.text , "html.parser")
news_check = soup.find_all("a" , class_ = "articlelist__contentDownloadItem")
for i in news_check :
print(i)
break
data = set()
for i in soup.find_all('a'):
for j in i.find_all('href'):
pdf_link = "https://www.volvogroup.com" + j.get('.pdf')
data.add(j)
print(pdf_link)

You can try the code below to get the PDF links:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

url = "https://www.volvogroup.com/en/news-and-media/press-releases.html"
source = requests.get(url)
soup = bs(source.text, "html.parser")

news_check = soup.find_all("a", class_="articlelist__contentDownloadItem")
data = set()
for i in news_check:
    pdf_link = "https://www.volvogroup.com" + i['href']
    data.add(pdf_link)

df = pd.DataFrame(data)
print(df)
Output:
0 https://www.volvogroup.com/content/dam/volvo-g...
1 https://www.volvogroup.com/content/dam/volvo-g...
2 https://www.volvogroup.com/content/dam/volvo-g...
3 https://www.volvogroup.com/content/dam/volvo-g...
4 https://www.volvogroup.com/content/dam/volvo-g...
5 https://www.volvogroup.com/content/dam/volvo-g...
6 https://www.volvogroup.com/content/dam/volvo-g...
7 https://www.volvogroup.com/content/dam/volvo-g...
8 https://www.volvogroup.com/content/dam/volvo-g...
9 https://www.volvogroup.com/content/dam/volvo-g...
10 https://www.volvogroup.com/content/dam/volvo-g...
11 https://www.volvogroup.com/content/dam/volvo-g...
12 https://www.volvogroup.com/content/dam/volvo-g...
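If you also want to download the files themselves rather than only collect the links, a minimal sketch along these lines should work (the pdfs output folder and the error handling are my assumptions, not part of the original answer):

import os
import requests

os.makedirs("pdfs", exist_ok=True)                  # hypothetical output folder
for link in data:
    filename = os.path.join("pdfs", link.split("/")[-1])
    resp = requests.get(link)
    resp.raise_for_status()                         # stop on HTTP errors
    with open(filename, "wb") as f:                 # PDFs are binary, so write bytes
        f.write(resp.content)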

Related

How to grab iframe data range from a page in python and beautifulsoup

I am trying to grab the iframe data of a page. I found some resources that work, but I can't make them return the data I want.
import requests
from bs4 import BeautifulSoup

s = requests.Session()
r = s.get("https://bscscan.com/token/0xe56842ed550ff2794f010738554db45e60730371#balances")
soup = BeautifulSoup(r.content, "html.parser")

iframe_src = soup.select_one("#tokeholdersiframe").attrs["src"]
r = s.get(f"https:{iframe_src}")
soup = BeautifulSoup(r.content, "html.parser")

for row in rowsblockdetails[1:]:
    rank = row.find_all('td')[0].text[0:].strip()
    address = row.find_all('td')[1].text[0:].strip()
    amount = row.find_all('td')[2].text[0:].strip()
    percentage = row.find_all('td')[3].text[0:]
    print(" {:<3} {:<25} {:>15} {:>10} ".format(rank, address, amount, percentage))
Current output (errors):
Traceback (most recent call last):
r = s.get(f"https:{iframe_src}")
return self.request('GET', url, **kwargs)
raise InvalidURL("Invalid URL %r: No host supplied" % url)
Wanted Output:
1 UniCrypt: Token Vesting 150,000,000 15.0451% Contract
2 PancakeSwap V2: BIN 17 77,320,752.850881264572940617 8.1141% Contract
3 0xa36b9dc17e421d86ddf8e490dafa87344e76125b 49,463,154.04616156547917712 4.9612%
4 0xbbda05ea467ad348212dade5c38c11910c14e83e 48,704,064.094074959661726945 4.8769%
5 0xcfdb8569fb546a010bb22b5057679c4053d4a231 11,493,129.656390775184191781 1.1528%
You can request the iframe HTML directly; all you need is the token address:
import requests
from bs4 import BeautifulSoup

s = requests.Session()
iframe_src = "https://bscscan.com/token/generic-tokenholders2?m=normal&a=0xe56842ed550ff2794f010738554db45e60730371"
r = s.get(iframe_src)
soup = BeautifulSoup(r.content, "html.parser")

for row in soup.select("tr:has(td)"):
    rank = row.find_all("td")[0].text[0:].strip()
    address = row.find_all("td")[1].text[0:].strip()
    amount = row.find_all("td")[2].text[0:].strip()
    percentage = row.find_all("td")[3].text[0:]
    print(
        " {:<3} {:<45} {:^35} {:>10} ".format(rank, address, amount, percentage)
    )
Prints:
1 UniCrypt: Token Vesting 150,000,000 0.0000%
2 PancakeSwap V2: BIN 17 76,926,258.749406306830460162 0.0000%
3 0xa36b9dc17e421d86ddf8e490dafa87344e76125b 49,463,154.04616156547917712 0.0000%
4 0xbbda05ea467ad348212dade5c38c11910c14e83e 48,770,394.148172297052962199 0.0000%
5 0xcfdb8569fb546a010bb22b5057679c4053d4a231 11,493,129.656390775184191781 0.0000%
6 0xe56842ed550ff2794f010738554db45e60730371 10,236,437.028812018664646028 0.0000%
7 0x1a1db1616854b4fe4723925f7129188483500eb6 10,000,000 0.0000%
8 0xa8b398896d67cea6d26fc140e056f745261c4b00 9,024,167.759368544603338806 0.0000%
9 0x0d9b1e53cbb251572d982d9f96520e8d40d22bb0 7,200,000 0.0000%
10 0x934f895383a6eb7d8a8cfd6c894f7fb57ad5f2b1 7,078,575.994558878120028183 0.0000%
11 0x739621ea040cf397a169047df2935458c6502d63 7,000,000 0.0000%
12 0xc7129c10f8056986716effffbbe0f1e9c80622d8 5,307,909.106163478741030017 0.0000%
13 0x43e5959343cd9154080c235c16fbb4bbd7f83e70 5,215,489.453108218359291854 0.0000%
14 0xa7a9544d86066bf583be602195536918497b1fff 5,000,000 0.0000%
15 0x81da471feb4a45438053dc05e709be056ec26c39 4,900,400 0.0000%
16 0xc68446c2a2e03e932774c3353629b9979b380c72 4,846,096.854970087386140103 0.0000%
17 0xb6957013a430e4cf509a4c51002073c1b24356e2 4,641,857.889152974553322217 0.0000%
18 0x4b0005c7bba3e10820b5b3a2863821e00701b383 4,570,260.896212362994501438 0.0000%
19 0xe02752824b6b11e027080e75f692bd22b3dc7091 4,388,894.363703154394892711 0.0000%
...and so on.
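If you would rather derive the iframe address from the page instead of hard-coding it, a minimal sketch (assuming the iframe's src attribute is a site-relative or scheme-relative URL, which is what caused the "No host supplied" error above) is to resolve it against the page URL with urllib.parse.urljoin:

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

s = requests.Session()
page_url = "https://bscscan.com/token/0xe56842ed550ff2794f010738554db45e60730371#balances"
page = BeautifulSoup(s.get(page_url).content, "html.parser")

iframe = page.select_one("#tokeholdersiframe")   # selector taken from the question
if iframe is not None:
    # urljoin handles "/path", "//host/path" and absolute URLs alike
    iframe_url = urljoin(page_url, iframe["src"])
    iframe_html = s.get(iframe_url).content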

Webscraping coordinates of a polygon with Python and BeautifulSoup

I'm trying to scrape information from this webpage and many similar ones: https://knowyourcity.info/settlement/1846/5119249
When viewing the page source, the coordinates for the polygon at the top of the page are available, but they are not there when inspecting the polygon element. Would anyone know how to scrape these coordinates into a column of a DataFrame using the BeautifulSoup package in Python?
This is the code I used to access the website:
from requests import get
url = 'http://knowyourcity.info/settlement/1846/5119249'
response = get(url)
print(response.text[:500])
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, "html.parser")
type(html_soup)
It looks like the map is driven by the JavaScript variable settlement. Therefore one option is to loop through all the script tags and search for var settlement. Once you've found the variable, use a simple find and string slicing to pull out the variable's data, convert it to JSON, and then return the boundaries.
The example is for illustration purposes; you'll most likely want to refactor the code:
from requests import get
from bs4 import BeautifulSoup
import json

def getHtml():
    url = 'http://knowyourcity.info/settlement/1846/5119249'
    response = get(url)
    return response.text

def extractBoundaries(html):
    html_soup = BeautifulSoup(html, "html.parser")
    scripts = html_soup.find_all('script')
    for script in scripts:
        startFind = "var settlement = "
        endFind = "};"
        if script.contents and startFind in script.contents[0]:
            scriptText = script.contents[0]
            startIndex = scriptText.find(startFind) + len(startFind)
            endIndex = scriptText.find(endFind) + len(endFind) - 1
            settlementData = scriptText[startIndex:endIndex]
            jsonData = json.loads(settlementData)
            return jsonData['verification/A0_Boundary']

html = getHtml()
results = extractBoundaries(html)
print(results)
Output:
5.599769999885609 -0.224459999729163 0 0;5.599920830581937 -0.2235293057328249 0 0;5.600343984087658 -0.2220772405721618 0 0;5.600582171330188 -0.2212706242398781 0 0;5.600757735181389 -0.2203650797845285 0 0;5.600943331869303 -0.2195227513738018 0 0;5.601229999764712 -0.2178069995933356 0 0;5.601684627743396 -0.2160719483616731 0 0;5.602178000314495 -0.215115999603654 0 0;5.60277082980997 -0.213977987593978 0 0;5.60322584449716 -0.2131045282513355 0 0;5.603939996133988 -0.2117290691411995 0 0;5.604261867990886 -0.2111080629866819 0 0;5.604746000027944 -0.210174000129939 0 0;5.605512212518647 -0.208745954062465 0 0;5.605957084651777 -0.2079168151088879 0 0;5.60642700020594 -0.2070410004417909 0 0;5.606837000227415 -0.2063009995914058 0 0;5.607503034537444 -0.2072989224072899 0 0;5.608332999968013 -0.2085879998362543 0 0;5.608940827457275 -0.2094694811315776 0 0;5.609384837140567 -0.2101133921192968 0 0;5.609949999892649 -0.210933000057878 0 0;5.610520744736618 -0.2114266172445696 0 0;5.61105999981919 -0.2118930002616821 0 0;5.612419000436546 -0.2126160003281257 0 0;5.613144659798252 -0.2126897915006225 0 0;5.614907000058054 -0.2128690003040674 0 0;5.615398000217567 -0.2144450001366067 0 0;5.615173904452149 -0.2159211302559356 0 0;5.614935501372315 -0.2174915048290131 0 0;5.61470415976919 -0.2190153628686744 0 0;5.614495076386731 -0.2203926071330784 0 0;5.61425499966856 -0.2219740001999071 0 0;5.613865981729703 -0.2233052558328268 0 0;5.613273865396593 -0.2253315354219581 0 0;5.612689000297166 -0.227333000017893 0 0;5.611838309990048 -0.2274067552175438 0 0;5.611219650166788 -0.2272163984180224 0 0;5.610458222968646 -0.2271212195685735 0 0;5.609547010985807 -0.2272079061199293 0 0;5.608730734136145 -0.2266937097468826 0 0;5.607481517358167 -0.2262178181977106 0 0;5.605377060602905 -0.2259990644052436 0 0;5.603420000032998 -0.2258499999774699 0 0;5.602499999875136 -0.2257000002547329 0 0;5.601491149397077 -0.225320574484897 0 0;5.599769999885609 -0.224459999729163 0 0
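To put those coordinates into a DataFrame column, as the question asks, a possible follow-up sketch (assuming the first value of each semicolon-separated entry is the latitude and the second the longitude, as in the output above; the column names are my own):

import pandas as pd

# "results" is the semicolon-separated string returned by extractBoundaries()
points = [p.split() for p in results.split(';') if p.strip()]
df = pd.DataFrame({
    "latitude": [float(p[0]) for p in points],
    "longitude": [float(p[1]) for p in points],
})
print(df.head())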

Is there any way to get the cookies and cache of a visited website from chrome to beautifulsoup in Python?

I want to scrape a certain website's weather data. The default page layout gives a maximum of 40 results, but when the layout is changed to a simple list it gives 100 results; the layout resets to the default on each visit, which is difficult to handle with Selenium. Is there any way to get the cookies saved in Chrome so they can be used with Beautiful Soup?
import requests
from bs4 import BeautifulSoup
import browser_cookie3

cj = browser_cookie3.load()
s = requests.Session()
url = "https://something.org/titles/2"
i = 1
print(cj)
for c in cj:
    if 'mangadex' in str(c):
        s.cookies.set_cookie(c)
r = s.get(url)
soup = BeautifulSoup(r.content, 'lxml')
for anime in soup.find_all('div', {'class': 'manga-entry col-lg-6 border-bottom pl-0 my-1'}):
    det = anime.find('a', {"class": "ml-1 manga_title text-truncate"})
    anime_name = det.text
    anime_link = det['href']
    stars = anime.select("span")[3].text
    print(anime_name, anime_link, stars, i)
    i = i + 1
Try:
import browser_cookie3
import requests

cj = browser_cookie3.load()
s = requests.Session()
for c in cj:
    if 'sitename' in str(c):
        s.cookies.set_cookie(c)
r = s.get(the_site)
This code uses the browser's cookies in a requests Session. Simply change 'sitename' to the site you want the cookies from.
Your new code:
import requests
from bs4 import BeautifulSoup
import browser_cookie3

cj = browser_cookie3.load()
s = requests.Session()
url = "https://something.org/titles/2"
i = 1
print(cj)
for c in cj:
    if 'mangadex' in str(c):
        s.cookies.set_cookie(c)
r = s.get(url)
soup = BeautifulSoup(r.content, 'lxml')
for anime in soup.find_all('div', {'class': 'manga-entry row m-0 border-bottom'}):
    det = anime.find('a', {"class": "ml-1 manga_title text-truncate"})
    anime_name = det.text
    anime_link = det['href']
    stars = anime.select("span")[3].text
    print(anime_name, anime_link, stars, i)
    i = i + 1
prints:
-Hitogatana- /title/540/hitogatana 4 1
-PIQUANT- /title/44134/piquant 5 2
-Rain- /title/37103/rain 4 3
-SINS- /title/1098/sins 4
:radical /title/46819/radical 1 5
:REverSAL /title/3877/reversal 3 6
... /title/52206/ 7
...Curtain. ~Sensei to Kiyoraka ni Dousei~ /title/7829/curtain-sensei-to-kiyoraka-ni-dousei 8
...Junai no Seinen /title/28947/junai-no-seinen 9
...no Onna /title/10162/no-onna 2 10
...Seishunchuu! /title/19186/seishunchuu 11
...Virgin Love /title/28945/virgin-love 12
.flow - Untitled (Doujinshi) /title/27292/flow-untitled-doujinshi 2 13
.gohan /title/50410/gohan 14
.hack//4koma + Gag Senshuken /title/7750/hack-4koma-gag-senshuken 24 15
.hack//Alcor - Hagun no Jokyoku /title/24375/hack-alcor-hagun-no-jokyoku 16
.hack//G.U.+ /title/7757/hack-g-u 1 17
.hack//GnU /title/7758/hack-gnu 18
.hack//Link - Tasogare no Kishidan /title/24374/hack-link-tasogare-no-kishidan 1 19
.hack//Tasogare no Udewa Densetsu /title/5817/hack-tasogare-no-udewa-densetsu 20
.hack//XXXX /title/7759/hack-xxxx 21
.traeH /title/9789/traeh 22
(G) Edition /title/886/g-edition 1 23
(Not) a Househusband /title/22832/not-a-househusband 6 24
(R)estauraNTR /title/37551/r-estaurantr 14 25
[ rain ] 1st Story /title/25587/rain-1st-story 3 26
[another] Xak /title/24881/another-xak 27
[es] ~Eternal Sisters~ /title/4879/es-eternal-sisters 1 28
and so on to 100...
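As a side note, browser_cookie3 can also load cookies from Chrome alone and filter them by domain, which avoids the manual 'mangadex' check. A minimal sketch, assuming your browser_cookie3 version supports the domain_name keyword (the site name here is the placeholder from the question):

import browser_cookie3
import requests

# load only Chrome's cookies for the domain of interest
cj = browser_cookie3.chrome(domain_name="something.org")
s = requests.Session()
s.cookies.update(cj)        # cj is a CookieJar, so it can be merged directly
r = s.get("https://something.org/titles/2")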

re-iterate over and over rather than once in soup

I keep re-iterating over this code. I'm keen to scrape all the past results data from this site, yet I keep looping over the races one by one.
For example, the printed race_number goes 1, then 1, 2, then 1, 2, 3, and so on.
The end goal is to fill all the lists with data and put them into a pandas DataFrame to look at results and trends.
import requests
import csv
import os
import numpy
import pandas
from bs4 import BeautifulSoup as bs

with requests.Session() as s:
    webpage_response = s.get('http://www.harness.org.au/racing/fields/race-fields/?mc=SW010420')
    soup = bs(webpage_response.content, "html.parser")
    #soup1 = soup.select('.content')
    results = soup.find_all('div', {'class': 'forPrint'})

    race_number = []
    race_name = []
    race_title = []
    race_distance = []
    place = []
    horse_name = []
    Prizemoney = []
    Row = []
    horse_number = []
    Trainer = []
    Driver = []
    Margin = []
    Starting_odds = []
    Stewards_comments = []
    Scratching = []
    Track_Rating = []
    Gross_Time = []
    Mile_Rate = []
    Lead_Time = []
    First_Quarter = []
    Second_Quarter = []
    Third_Quarter = []
    Fourth_Quarter = []

    for race in results:
        race_number1 = race.find(class_='raceNumber').get_text()
        race_number.append(race_number1)
        race_name1 = race.find(class_='raceTitle').get_text()
        race_name.append(race_name1)
        race_title1 = race.find(class_='raceInformation').get_text(strip=True)
        race_title.append(race_title1)
        race_distance1 = race.find(class_='distance').get_text()
        race_distance.append(race_distance1)
I need help fixing the repeated iteration. Also, what is the next best move to get at the table data rather than just the headers above?
Cheers
Is this the output you are expecting:
import requests
import csv
import os
import numpy
import pandas as pd
import html
from bs4 import BeautifulSoup as bs

with requests.Session() as s:
    webpage_response = s.get('http://www.harness.org.au/racing/fields/race-fields/?mc=SW010420')
    soup = bs(webpage_response.content, "html.parser")
    #soup1 = soup.select('.content')

    data = {}
    data["raceNumber"] = [i['rowspan'] for i in soup.find_all("td", {"class": "raceNumber", "rowspan": True})]
    data["raceTitle"] = [i.get_text(strip=True) for i in soup.find_all("td", {"class": "raceTitle"})]
    data["raceInformation"] = [i.get_text(strip=True) for i in soup.find_all("td", {"class": "raceInformation"})]
    data["distance"] = [i.get_text(strip=True) for i in soup.find_all("td", {"class": "distance"})]

    print(data)
    data_frame = pd.DataFrame(data)
    print(data_frame)
## Output
## raceNumber raceTitle raceInformation distance
##0 3 PREMIX KING PACE $4,500\n\t\t\t\t\t4YO and older.\n\t\t\t\t\tNR... 1785M
##1 3 GATEWAY SECURITY PACE $7,000\n\t\t\t\t\t4YO and older.\n\t\t\t\t\tNR... 2180M
##2 3 PERRY'S FOOTWEAR TROT $7,000\n\t\t\t\t\t\n\t\t\t\t\tNR 46 to 55.\n\t... 2180M
##3 3 DELAHUNTY PLUMBING 3YO TROT $7,000\n\t\t\t\t\t3YO.\n\t\t\t\t\tNR 46 to 52.... 2180M
##4 3 RAYNER'S FRUIT & VEGETABLES 3YO PACE $7,000\n\t\t\t\t\t3YO.\n\t\t\t\t\tNR 48 to 56.... 2180M
##5 3 KAYE MATTHEWS TRIBUTE $9,000\n\t\t\t\t\t4YO and older.\n\t\t\t\t\tNR... 2180M
##6 3 TALQUIST TREES PACE $7,000\n\t\t\t\t\t\n\t\t\t\t\tNR 62 to 73.\n\t... 2180M
##7 3 WEEKLY ADVERTISER 3WM PACE $7,000\n\t\t\t\t\t\n\t\t\t\t\tNR 56 to 61.\n\t... 1785M
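The raceInformation column still contains the literal newline and tab characters visible in the output above; a small follow-up sketch (my own addition, not part of the original answer) that collapses that whitespace:

# collapse runs of whitespace such as "\n\t\t\t\t\t" into single spaces
data_frame["raceInformation"] = (
    data_frame["raceInformation"].str.replace(r"\s+", " ", regex=True).str.strip()
)
print(data_frame)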

Issues dividing an HTML part by 'tr' tags using Selenium Python

I tried to collect the data from this page (http://www.bobaedream.co.kr/mycar/popup/mycarChart_4.php?zone=C&cno=639137&tbl=cyber) using Selenium with Python 3.6. What I tried to do is divide the section into two parts and collect the data from each part.
The items in the two parts are made up of 39 'tr' tags. I select the 0th to 14th 'tr' tags for the first part and the 15th 'tr' tag to the end for the second part, but the first part already prints everything up to the last 'tr' tag. I don't understand why this happens.
Below is my code:
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import urlparse
from urllib.parse import quote
from selenium import webdriver
import re
import time

popup_inspection = "http://www.bobaedream.co.kr/mycar/popup/mycarChart_4.php?zone=C&cno=639137&tbl=cyber"

driver = webdriver.PhantomJS()
driver.set_window_size(500, 300)
driver.get(popup_inspection)
soup_inspection = BeautifulSoup(driver.page_source, "html.parser")

count = 0       # for loop count
count_insp = 0  # 누유 및 오작동 (leaks and malfunctions)
count_in = 0    # 골격 (frame)
count_out = 0   # 외관 (exterior)

insp_tables = soup_inspection.find_all('table', class_=True)
for insp_table in insp_tables[4].find_all('tr'):
    labels = insp_table.find_all('td', class_="center")

    for label in labels[:15]:
        if label.find("input", type="checkbox", checked=True):
            count_out += 1
            print(label.text)
        else:
            print(label.text)
    print("외관 이상 수: ", count_out)   # number of exterior defects

    for label in labels[16:]:
        if label.find("input", type="checkbox", checked=True):
            count_in += 1
            print(label.text)
        else:
            print(label.text)
    print("골격 이상 수: ", count_in)    # number of frame defects
The result I would like to have is like below:
<Upper Part>
1 후드 0 0
2 프론트 휀더(좌) 0 0
......
8 트렁크 리드 1 0
Total : 1 0
<Lower Part>
1 프론트 패널
2 크로스 멤버
....
22 리어 패널 1 0
23 트렁크 플로어 0 0
Total : 1 0
Please help me to work this out.
Thanks.
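For what it's worth, in the code above the slicing is applied to the 'td' cells inside each individual row, not to the rows themselves, which is why the first part runs through every 'tr'. A sketch of slicing the list of 'tr' tags instead (the table index and the 15-row split point are taken from the question and may need adjusting):

rows = insp_tables[4].find_all('tr')

upper_rows = rows[:15]   # first part: rows 0-14
lower_rows = rows[15:]   # second part: row 15 to the end

for row in upper_rows:
    cells = row.find_all('td', class_="center")
    print([c.get_text(strip=True) for c in cells])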
