I'm trying to get all the labels (text) from the boxes that are checked off (or questions answered) on the following site, but I don't get any text out at all.
Furthermore, the way I thought of doing the scraping was to collect all the links first - on the right side you can switch between the pages. This list also seems to contain every link twice...
Here is my current code (the site is the main_url link in the code):
import bs4 as bs
from splinter import Browser
import time

executable_path = {'executable_path': 'C:/users/chromedriver.exe'}
browser = Browser('chrome', **executable_path)

main_url = 'https://reporting.unpri.org/surveys/PRI-Reporting-Framework-2016/0ad07cdc-cfbc-4c5b-a79f-2b07e93d8521/79894dbc337a40828d895f9402aa63de/html/2/?lang=&a=1'
browser.visit(main_url)
source = browser.html
soup = bs.BeautifulSoup(source, 'lxml')

base_url = main_url[:-51]
urls = []
print(base_url)

for i in soup.find_all('div', class_='accordion-inner n-accordion-link'):
    for j in soup.find_all('a', class_='tooltiper'):
        urls.append(j['href'])

print(urls)

result = []
for k in urls:
    ext = k[8:]
    browser.visit(base_url + ext)
    source1 = browser.html
    soup1 = bs.BeautifulSoup(source1, 'lxml')
    temp_list = []
    print(browser.url)
    for img in soup1.find_all('img', class_='readradio'):
        for t in img['src']:
            if t == '/Style/img/checkedradio.png':
                for x in soup1.find_all('span', class_='title'):
                    txt = str(x.string)
                    temp_list.append(txt)
    result.append(temp_list)

print(result)
I get the following output for the result list, which is supposed to contain the text:
[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
Updated code with suggestion:
import bs4 as bs
from splinter import Browser
import time

executable_path = {'executable_path': '/users/nichlasrasmussen/documents/webdrivers/phantomjs'}
browser = Browser('phantomjs', **executable_path)

main_url = 'https://reporting.unpri.org/surveys/PRI-Reporting-Framework-2016/0ad07cdc-cfbc-4c5b-a79f-2b07e93d8521/79894dbc337a40828d895f9402aa63de/html/2/?lang=&a=1'
browser.visit(main_url)
source = browser.html
soup = bs.BeautifulSoup(source, 'lxml')

base_url = main_url[:-51]
urls = []
print(base_url)

for i in soup.find_all('div', class_='accordion-inner n-accordion-link'):
    for j in soup.find_all('a', class_='tooltiper'):
        urls.append(j['href'])

print(urls)

result = []
for k in urls:
    ext = k[8:]
    browser.visit(base_url + ext)
    source1 = browser.html
    soup1 = bs.BeautifulSoup(source1, 'lxml')
    temp_list = []
    print(browser.url)
    for label in soup1.find_all('label', class_='radio'):
        t = label.find('img', class_='readradio')
        if 'checkedradio' in t['src']:
            content = soup1.find('span', class_='title')
            temp_list.append(content.text)
    result.append(temp_list)

print(result)
You can simply refer to the shared parent of the img and span.title elements, which is label.radio. There is no need for a huge loop starting from the root (soup1).
Try this:
for label in soup1.find_all('label', class_='radio'):
    t = label.find('img', class_='readradio')
    if t and '/Style/img/checkedradio.png' in t.get('src'):
        content = label.find('span', class_='title')
        temp_list.append(content.text)
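As for the link list containing everything twice: in the posted code the inner find_all searches the whole document once for every accordion div, so each href gets appended once per div. A minimal sketch of scoping the inner search to the div and skipping duplicates (assuming the tooltiper anchors really live inside those divs):
urls = []
for div in soup.find_all('div', class_='accordion-inner n-accordion-link'):
    for a in div.find_all('a', class_='tooltiper'):  # search only inside this div
        if a['href'] not in urls:                    # keep each link once
            urls.append(a['href'])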
In short, I can't get the links from the "href" attribute on this site (a Turkish online seller of books and related items).
Here's my code (I know it's not the best; I've been learning Python online for a few months, so any tips on best practices are also welcome).
I tried to get the book names, writers, prices, publishers and the link for each book; without the links it works as I expected.
import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint

yazar = []
fiyat = []
yayın = []
isim = []

for i in range(1, 10):
    url = "https://www.dr.com.tr/CokSatanlar/Kitap#/page=" + str(i)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "lxml")

    # book names
    k = soup.find_all("a", {"class": "prd-name"})
    for i in k:
        isim.append(i.text)

    # writer names
    y = soup.find_all("a", {"class": "who text-overflow"})
    for i in y:
        yazar.append(i.text)

    # prices
    f = soup.find_all("div", {"class": "prd-price"})
    for i in f:
        fiyat.append(i.text.split()[0])

    # publishers
    ye = soup.find_all("a", {"class": "prd-publisher"})
    for i in ye:
        yayın.append(i.get("title"))

    sleep(randint(2, 4))
However, when I try to get the links with
soup.find_all("a", {"class":"prd-name"}).get("href")
it returns None, and I couldn't make it work whatever I tried.
Thank you all in advance, and sorry for a slightly longer than usual post.
The data you see on the page is loaded from an external location, so you need a different URL to get the correct data:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://www.dr.com.tr/Catalog/CatalogProducts"
data = {
    "catalogId": "4020",
    "page": "1",
    "sortfield": "soldcount",
    "sortorder": "desc",
    "size": "60",
    "categoryid": "0",
    "parentId": "0",
    "mediatypes": "",
    "HideNotForSale": "true",
    "minPrice": "-1",
    "maxPrice": "-1",
    "writer": "",
    "minDiscount": "-1",
    "maxdiscount": "-1",
    "language": "",
}

all_data = []
for page in range(1, 3):  # <-- increase number of pages here
    print(f"Getting page {page}")

    data["page"] = page
    soup = BeautifulSoup(requests.post(url, data=data).content, "html.parser")

    for p in soup.select(".prd-content"):
        all_data.append(p.get_text(strip=True, separator="|").split("|")[:5])

df = pd.DataFrame(
    all_data, columns=["name", "autor", "price", "type", "publisher"]
)
print(df)
df.to_csv("data.csv", index=False)
Prints:
name autor price type publisher
0 Esra Ezmeci Seti 5 Kitap Takım - Defter Hediyeli Esra Ezmeci 155,45 TL İnce Kapak Destek Yayınları
1 Şimdi Onlar Düşünsün Bircan Yıldırım 36,20 TL İnce Kapak Destek Yayınları
2 İz Bıraktığın Kadar Varsın Esra Ezmeci 36,20 TL İnce Kapak Destek Yayınları
...
and saves the data to data.csv.
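Since the question also asked for the link to each book, here is a hedged sketch building on the loop above - it assumes each .prd-content block contains the a.prd-name anchor with a relative href, which I have not verified:
for p in soup.select(".prd-content"):
    row = p.get_text(strip=True, separator="|").split("|")[:5]
    a = p.select_one("a.prd-name")  # assumed: the product-name anchor sits inside the block
    row.append("https://www.dr.com.tr" + a["href"] if a else None)
    all_data.append(row)
The DataFrame columns list would then need a sixth entry, e.g. "link".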
I think you won't get None; you will get:
AttributeError: ResultSet object has no attribute 'get'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
find_all() produces a ResultSet, so you have to iterate over it to get all the hrefs:
for a in soup.find_all("a", {"class": "prd-name"}):
    print('https://www.dr.com.tr' + a.get("href"))
Output
https://www.dr.com.tr/kitap/daha-adil-bir-dunya-mumkun/arastirma-tarih/politika-arastirma/turkiye-politika-/urunno=0001934858001
https://www.dr.com.tr/kitap/burasi-cok-onemli-enerjiden-ekonomiye-tam-bagimsiz-turkiye/arastirma-tarih/politika-arastirma/turkiye-politika-/urunno=0001966362001
https://www.dr.com.tr/kitap/iz-biraktigin-kadar-varsin/egitim-basvuru/psikoloji-bilimi/urunno=0001947472001
https://www.dr.com.tr/kitap/simdi-onlar-dusunsun/bircan-yildirim/egitim-basvuru/kisisel-gelisim/urunno=0001964436001
https://www.dr.com.tr/kitap/kadinlar-sicak-erkekler-soguk-sever/esra-ezmeci/egitim-basvuru/psikoloji-bilimi/urunno=0001904239001
https://www.dr.com.tr/kitap/dustugunde-kalkarsan-hayat-guzeldir/egitim-basvuru/psikoloji-bilimi/urunno=0001816754001
...
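If you want those links collected in a list like the other fields, a small sketch following the naming style of the question (the name linkler is just illustrative):
linkler = []
for a in soup.find_all("a", {"class": "prd-name"}):
    linkler.append("https://www.dr.com.tr" + a.get("href"))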
I have three lists emojiLink, emojiTitle, emojiDescription in my code below.
import requests
from bs4 import BeautifulSoup
import pandas as pd

r = requests.get("https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(r.text, "lxml")

emojiLink = []
emojiTitle = []
emojiDescription = []

for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for img in tableRow.findChildren("img"):
        emojiLink.append(img['src'])

for tableData in soup.find_all("td"):
    for boldTag in tableData.findChildren("b"):
        emojiTitle.append(boldTag.text)

for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for tabledata in tableRow.findChildren("td"):
        if tabledata.has_attr("id"):
            k = tabledata.text.strip().split('\n')[-1]
            l = k.lstrip()
            emojiDescription.append(l)
I want to convert these lists into a JSON object that looks like this:
{{"link": "emojiLink[0]", "title": "emojiTitle[0]", "desc": "emojiDescription[0]"}, {"link": "emojiLink[1]", "title": "emojiTitle[1]", "desc": "emojiDescription[1]"}, ...} and so on.
I can't figure out how to do this.
Thanks in advance!
This returns an array of JSON objects based off of Chandella07's answer.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import json

r = requests.get("https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(r.text, "lxml")

emojiLinkList = []
emojiTitleList = []
emojiDescriptionList = []
jsonData = []

for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for img in tableRow.findChildren("img"):
        emojiLinkList.append(img['src'])

for tableData in soup.find_all("td"):
    for boldTag in tableData.findChildren("b"):
        emojiTitleList.append(boldTag.text)

for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for tabledata in tableRow.findChildren("td"):
        if tabledata.has_attr("id"):
            k = tabledata.text.strip().split('\n')[-1]
            l = k.lstrip()
            emojiDescriptionList.append(l)

for link, title, desc in zip(emojiLinkList, emojiTitleList, emojiDescriptionList):
    record = {"link": link, "title": title, "desc": desc}  # avoid shadowing the built-in dict
    jsonData.append(record)

print(json.dumps(jsonData, indent=2))
Data Example:
{
"link": "https://www.emojimeanings.net/img/emojis/purse_1f45b.png",
"title": "Wallet",
"desc": "After the shopping trip, the money has run out or the wallet was forgotten at home. The accessory keeps loose money but also credit cards or make-up. Can refer to shopping or money and stand for femininity and everything girlish."
},
Access the elements from each list one by one, put them into a dict, and append the dict to a list at the end:
import json

# some example lists
em_link = ['a', 'b', 'c']
em_title = ['x', 'y', 'z']
em_desc = [1, 2, 3]

arr = []
for i, j, k in zip(em_link, em_title, em_desc):
    d = {}
    d.update({"link": i})
    d.update({"title": j})
    d.update({"desc": k})
    arr.append(d)

print(json.dumps(arr))
Output:
[{"link": "a", "title": "x", "desc": 1}, {"link": "b", "title": "y", "desc": 2}, {"link": "c", "title": "z", "desc": 3}]
There is something wrong with your dict format. {{...},{...}} is not a valid format, [{...},{...}] is valid.
Regarding the merging logic:
for i in zip([1, 2, 3], ["a", "b"], [8, 9, 10]):
    print(i)
... will output ...
(1, 'a', 8)
(2, 'b', 9)
Try something like that:
out = []
for i in zip(emojiLink, emojiTitle, emojiDescription):
    out.append({"link": i[0], ...})
You can use the json library to read/write in json format.
import json
with open('./smthn.json', 'w') as f:
    json.dump({"a": "dictionary"}, f)
https://devtut.github.io/python/json-module.html#storing-data-in-a-file
So, you want a list of dictionary records? If you're sure all of the lists are the same length, you can do:
gather = []
for l, t, d in zip(emojiLink, emojiTitle, emojiDescription):
    gather.append({"link": l, "title": t, "desc": d})

json.dump(gather, open("myrecord.json", "w"))
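As a side note, zipping three separately built lists only works if they stay perfectly aligned. A sketch that instead builds each record directly from its table row - assuming every ugc_emoji_tr row contains the img, the bold title and the description td with an id, as in the loops above - avoids that dependency:
records = []
for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    img = tableRow.find("img")
    title = tableRow.find("b")
    desc_td = tableRow.find("td", id=True)  # the td carrying an id holds the description text
    if img and title and desc_td:
        records.append({
            "link": img["src"],
            "title": title.text,
            "desc": desc_td.text.strip().split("\n")[-1].lstrip(),
        })

print(json.dumps(records, indent=2))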
I am trying to parse an HTML page with BeautifulSoup. The task is to get the data underlined in red for all the lots on this page. I got the data from the left and the right blocks (the lot, auction name, country, etc.), but getting the data from the central block is problematic for me. Here is an example of what I have done.
import requests
import re
from bs4 import BeautifulSoup as bs
import pandas as pd

URL_TEMPLATE = "https://www.artprice.com/artist/15079/wassily-kandinsky/lots/pasts?ipp=100"
FILE_NAME = "test"

def parse(url=URL_TEMPLATE):
    result_list = {'lot': [], 'name': [], 'date': [], 'type1': [], 'type2': [],
                   'width': [], 'height': [], 'estimate': [], 'hummerprice': [],
                   'auction_date': [], 'auction': [], 'country': []}
    r = requests.get(URL_TEMPLATE)
    soup = bs(r.text, "html.parser")
    lot_info = soup.find_all('p', class_='hidden-xs')
    date_info = soup.find_all('date')
    names_info = soup.find_all('a', class_='sln_lot_show')
    auction_info = soup.find_all('p', class_='visible-xs')
    auction_date_info = soup.find_all(string=re.compile(r'\d\d\s\w\w\w\s\d\d\d\d'))[1::2]
    type1_info = soup.find_all('div')
    for i in range(len(lot_info)):
        result_list['lot'].append(lot_info[i].text)
    for i in range(len(date_info)):
        result_list['date'].append(date_info[i].text)
    for i in range(len(names_info)):
        result_list['name'].append(names_info[i].text)
    for i in range(0, len(auction_info), 2):
        result_list['auction'].append(soup.find_all('p', class_='visible-xs')[i].strong.string)
    for i in range(1, len(auction_info), 2):
        result_list['country'].append(soup.find_all('p', class_='visible-xs')[i].string)
    for i in range(len(auction_date_info)):
        result_list['auction_date'].append(auction_date_info[i])
    return result_list

df = pd.DataFrame(data=parse())
df.to_excel("test.xlsx")
So, the task is to get the data from the central block separately for each lot on this page.
You need nth-of-type to access all those <p> elements.
This does it for just the first one to show that it works.
I'll leave it to you to clean up the output.
for div in soup.find_all('div', class_='col-xs-8 col-sm-6'):
    print(div.select_one('a').text.strip())
    print(div.select_one('p:nth-of-type(2)').text.strip())
    print(div.select_one('p:nth-of-type(3)').text.strip())
    print(div.select_one('p:nth-of-type(4)').text.strip())
    break
Result:
Abstract
Print-Multiple, Print in colors, 29 1/2 x 31 1/2 in75 x 80 cm
Estimate:
€ 560 - € 784
$ 605 - $ 848
£ 500 - £ 700
¥ 4,303 - ¥ 6,025
Hammer price:
not communicated
not communicated
not communicated
not communicated
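To collect those fields for every lot rather than printing only the first one, a sketch along the same lines (assuming the div.col-xs-8.col-sm-6 structure and the <p> positions are the same for every lot; the dictionary keys are illustrative):
central = []
for div in soup.find_all('div', class_='col-xs-8 col-sm-6'):
    name = div.select_one('a')
    details = div.select_one('p:nth-of-type(2)')    # medium / size line
    estimate = div.select_one('p:nth-of-type(3)')
    hammer = div.select_one('p:nth-of-type(4)')
    central.append({
        'name': name.text.strip() if name else None,
        'details': details.text.strip() if details else None,
        'estimate': estimate.text.strip() if estimate else None,
        'hammer_price': hammer.text.strip() if hammer else None,
    })
These records could then be merged into result_list inside parse().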
I want to try to get the data from this website http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_Dinov_020108_HeightsWeights using Beautiful Soup and requests. Here is my code:
import requests
from bs4 import BeautifulSoup
response = requests.get("http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_Dinov_020108_HeightsWeights")
soup = BeautifulSoup(response.text, "html.parser")
list_table_data = soup.find(class_="wikitable").contents
list_tr_data = list_table_data[1::2]
print(list_tr_data)
When you print list_tr_data, the output becomes:
[<tr>
<th>Index</th><th>Height(Inches)</th><th>Weight(Pounds)
</th></tr>, <tr>
<td>1</td><td>65.78</td><td>112.99
</td></tr>, <tr>
<td>2</td><td>71.52</td><td>136.49
</td></tr>, <tr>
<td>3</td><td>69.40</td><td>153.03
</td></tr>,...., <tr>
<td>200</td><td>71.39</td><td>127.88
</td></tr>]
I want the Height(Inches) data in a list called list_height_data, but when I try to access it using this code:
list_height_data = []
for row in list_tr_data:
    list_height_data.append(row.find_all("tr"))
print(list_height_data)
this gives a list of empty lists:
[[], [], [], [], [], [], [], [], [], [], ... []]
What should I do to get the Height(Inches) data? Printing list_height_data and len(list_height_data) should give:
[65.78, 71.52, 69.40, ..., 71.39]
200
You need to iterate over the td tags:
import requests
from bs4 import BeautifulSoup as soup
d = soup(requests.get('http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_Dinov_020108_HeightsWeights').text, 'html.parser')
_, *results = [[float(c.text.replace('\n', '')) for c in i.find_all('td')] for i in d.find('table', {'class':'wikitable'}).find_all('tr')]
height = [i[1] for i in results]
Output:
[65.78, 71.52, 69.4, 68.22, 67.79, 68.7, 69.8, 70.01, 67.9, 66.78, 66.49, 67.62, 68.3, 67.12, 68.28, 71.09, 66.46, 68.65, 71.23, 67.13, 67.83, 68.88, 63.48, 68.42, 67.63, 67.21, 70.84, 67.49, 66.53, 65.44, 69.52, 65.81, 67.82, 70.6, 71.8, 69.21, 66.8, 67.66, 67.81, 64.05, 68.57, 65.18, 69.66, 67.97, 65.98, 68.67, 66.88, 67.7, 69.82, 69.09, 69.91, 67.33, 70.27, 69.1, 65.38, 70.18, 70.41, 66.54, 66.36, 67.54, 66.5, 69.0, 68.3, 67.01, 70.81, 68.22, 69.06, 67.73, 67.22, 67.37, 65.27, 70.84, 69.92, 64.29, 68.25, 66.36, 68.36, 65.48, 69.72, 67.73, 68.64, 66.78, 70.05, 66.28, 69.2, 69.13, 67.36, 70.09, 70.18, 68.23, 68.13, 70.24, 71.49, 69.2, 70.06, 70.56, 66.29, 63.43, 66.77, 68.89, 64.87, 67.09, 68.35, 65.61, 67.76, 68.02, 67.66, 66.31, 69.44, 63.84, 67.72, 70.05, 70.19, 65.95, 70.01, 68.61, 68.81, 69.76, 65.46, 68.83, 65.8, 67.21, 69.42, 68.94, 67.94, 65.63, 66.5, 67.93, 68.89, 70.24, 68.27, 71.23, 69.1, 64.4, 71.1, 68.22, 65.92, 67.44, 73.9, 69.98, 69.52, 65.18, 68.01, 68.34, 65.18, 68.26, 68.57, 64.5, 68.71, 68.89, 69.54, 67.4, 66.48, 66.01, 72.44, 64.13, 70.98, 67.5, 72.02, 65.31, 67.08, 64.39, 69.37, 68.38, 65.31, 67.14, 68.39, 66.29, 67.19, 65.99, 69.43, 67.97, 67.76, 65.28, 73.83, 66.81, 66.89, 65.74, 65.98, 66.58, 67.11, 65.87, 66.78, 68.74, 66.23, 65.96, 68.58, 66.59, 66.97, 68.08, 70.19, 65.52, 67.46, 67.41, 69.66, 65.8, 66.11, 68.24, 68.02, 71.39]
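As an alternative sketch (assuming pandas and lxml are installed), the wikitable can also be parsed directly with pandas; the column label Height(Inches) comes from the table header shown above:
import pandas as pd

# read_html returns one DataFrame per matching table
tables = pd.read_html(
    "http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_Dinov_020108_HeightsWeights",
    attrs={"class": "wikitable"},
)
list_height_data = tables[0]["Height(Inches)"].tolist()
print(list_height_data[:5], len(list_height_data))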
Hope you are all well! I'm new and using Python 2.7. I'm trying to extract emails from a publicly available directory website that does not seem to have an API. This is the site: http://www.tecomdirectory.com/companies.php?segment=&activity=&search=category&submit=Search. The code stops gathering emails at the point on the page where it says "load more".
Here is my code:
import requests
import re
from bs4 import BeautifulSoup

file_handler = open('mail.txt', 'w')
soup = BeautifulSoup(requests.get('http://www.tecomdirectory.com/companies.php?segment=&activity=&search=category&submit=Search').content)
tags = soup('a')
list_new = []
for tag in tags:
    if re.findall(r'href="mailto:([^"@]+@[^"]+)">\1</a>', '%s' % tag):
        list_new = list_new + re.findall(r'href="mailto:([^"@]+@[^"]+)">\1</a>', '%s' % tag)
for x in list_new:
    file_handler.write('%s\n' % x)
file_handler.close()
How can I make sure that the code goes to the end of the directory and does not stop where it shows "load more"?
Thanks.
Warmest regards
You just need to post some data, in particular incrementing group_no to simulate clicking the load more button:
from bs4 import BeautifulSoup
import requests

# you can set whatever here to influence the results
data = {"group_no": "1",
        "search": "category",
        "segment": "",
        "activity": "",
        "retail": "",
        "category": "",
        "Bpark": "",
        "alpha": ""}

post = "http://www.tecomdirectory.com/getautocomplete_keyword.php"

with requests.Session() as s:
    soup = BeautifulSoup(
        s.get("http://www.tecomdirectory.com/companies.php?segment=&activity=&search=category&submit=Search").content,
        "html.parser")
    print([a["href"] for a in soup.select('a[href^="mailto:"]')])
    for i in range(1, 5):
        data["group_no"] = str(i)
        soup = BeautifulSoup(s.post(post, data=data).content, "html.parser")
        print([a["href"] for a in soup.select('a[href^="mailto:"]')])
To go to the end, you can loop until the post returns no HTML, which signifies that we cannot load any more pages:
def yield_all_mails():
    data = {"group_no": "1",
            "search": "category",
            "segment": "",
            "activity": "",
            "retail": "",
            "category": "",
            "Bpark": "",
            "alpha": ""}
    post = "http://www.tecomdirectory.com/getautocomplete_keyword.php"
    start = "http://www.tecomdirectory.com/companies.php?segment=&activity=&search=category&submit=Search"
    with requests.Session() as s:
        resp = s.get(start)
        soup = BeautifulSoup(s.get(start).content, "html.parser")
        yield (a["href"] for a in soup.select('a[href^="mailto:"]'))
        i = 1
        while resp.content.strip():
            data["group_no"] = str(i)
            resp = s.post(post, data=data)
            soup = BeautifulSoup(resp.content, "html.parser")
            yield (a["href"] for a in soup.select('a[href^="mailto:"]'))
            i += 1
So if we run the function as below, setting "alpha": "Z" to iterate over just the Z's:
from itertools import chain

for mail in chain.from_iterable(yield_all_mails()):
    print(mail)
We would get:
mailto:info@10pearls.com
mailto:fady@24group.ae
mailto:pepe@2heads.tv
mailto:2interact@2interact.us
mailto:gc@worldig.com
mailto:marilyn.pais@3i-infotech.com
mailto:3mgulf@mmm.com
mailto:venkat@4gid.com
mailto:info@4power.biz
mailto:info@4sstudyabroad.com
mailto:fouad@622agency.com
mailto:sahar@7quality.com
mailto:mike.atack@8ack.com
mailto:zyara@emirates.net.ae
mailto:aokasha@zynx.com
Process finished with exit code 0
You should put a sleep in between requests so you don't hammer the server and get yourself blocked.
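For example, a minimal sketch of pausing inside the paging loop of yield_all_mails (the one-second delay is an arbitrary choice):
import time

while resp.content.strip():
    data["group_no"] = str(i)
    resp = s.post(post, data=data)
    soup = BeautifulSoup(resp.content, "html.parser")
    yield (a["href"] for a in soup.select('a[href^="mailto:"]'))
    i += 1
    time.sleep(1)  # pause between requests so we don't hammer the server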