python to get the json data from multiple links - python

I am trying to extract the JSON data from multiple links, but looks like I am doing something wrong. I am getting only the last id details. How do I get the JSON data for all the links? Also, is it possible to export all the results to a CSV file?
Please kindly guide me.
Here is the code that I am using.
import json
import requests
from bs4 import BeautifulSoup

# Job ids to fetch; each is appended to the base careers URL.
a_list = [234147, 234548, 232439, 234599, 226672, 234117, 222388]
a_url = 'https://jobs.mycareerportal/careers-home/jobs'
urls = []

for n in a_list:
    kurl = '{}/{}'.format(a_url, n)
    soup = BeautifulSoup(requests.get(kurl).content, "html.parser")
    # Each job page embeds its metadata in JSON-LD <script> tags.
    data = [
        json.loads(x.string)
        for x in soup.find_all("script", type="application/ld+json")
    ]
    # BUG FIX: this loop must be nested inside the page loop; at the old
    # top-level indentation it only saw `data` from the LAST fetched page,
    # which is why only the last id's details were printed.
    for d in data:
        k = str(d['url']) + str(d['jobLocation']['address'])
        urls.append(kurl)
        print(k)
and this is the output that I am getting
PS E:\Python> & C:/Users/KristyG/Anaconda3/python.exe e:/Python/url_append.py
https://jobs.mycareerportal/careers-home/jobs/222388?{'#type': 'PostalAddress', 'addressLocality': 'Panama City', 'addressRegion': 'Florida', 'streetAddress': '4121 Hwy 98', 'postalCode': '32401-1170', 'addressCountry': 'United States'}
PS E:\Python>
Please note, I had to change the website name as I can't share it in public.

I guess it's just an indentation problem — try nesting the inner loop inside the first for loop, like this:
import json
import requests
from bs4 import BeautifulSoup

a_list = [234147, 234548, 232439, 234599, 226672, 234117, 222388]
a_url = 'https://jobs.mycareerportal/careers-home/jobs'
urls = []

for job_id in a_list:
    page_url = '{}/{}'.format(a_url, job_id)
    page_soup = BeautifulSoup(requests.get(page_url).content, "html.parser")
    # Walk every JSON-LD <script> tag on this job page, nested inside the
    # page loop so every id is processed (not just the last one).
    for script_tag in page_soup.find_all("script", type="application/ld+json"):
        record = json.loads(script_tag.string)
        summary = str(record['url']) + str(record['jobLocation']['address'])
        urls.append(page_url)
        print(summary)

Related

Cannot get the "href" attributes via BeautifulSoup

in short, i can't get the links of "href" attribute from this link (a turkish online book and related stuff seller).
here's my code (i know it's not the best, i'm learning python for a few months online, so any heads up for best practices also welcomed)
i tried to get book names, writers, prices, publishers and the links for each book; without links it's working as i expected.
import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint

yazar = []   # writer names
fiyat = []   # prices
yayın = []   # publishers
isim = []    # book names

# BUG FIX: the inner loops all reused `i`, clobbering the outer page index.
# Distinct loop variables make the nesting unambiguous.
for page_no in range(1, 10):
    url = "https://www.dr.com.tr/CokSatanlar/Kitap#/page=" + str(page_no)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "lxml")
    # book names
    for tag in soup.find_all("a", {"class": "prd-name"}):
        isim.append(tag.text)
    # writer names
    for tag in soup.find_all("a", {"class": "who text-overflow"}):
        yazar.append(tag.text)
    # prices (first whitespace-separated token of the price cell)
    for tag in soup.find_all("div", {"class": "prd-price"}):
        fiyat.append(tag.text.split()[0])
    # publishers
    for tag in soup.find_all("a", {"class": "prd-publisher"}):
        yayın.append(tag.get("title"))
    # be polite to the server between page fetches
    sleep(randint(2, 4))
however when i try to get links
soup.find_all("a", {"class":"prd-name"}).get("href")
it turns none and i couldn't manage to make this work whatever i tried.
thank you all in advance and sorry for a little longer than usual post.
The data you see on the page is loaded from external location, so you need other URL to get correct data:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://www.dr.com.tr/Catalog/CatalogProducts"

# POST payload expected by the catalog endpoint; "page" is overwritten per request.
data = {
    "catalogId": "4020",
    "page": "1",
    "sortfield": "soldcount",
    "sortorder": "desc",
    "size": "60",
    "categoryid": "0",
    "parentId": "0",
    "mediatypes": "",
    "HideNotForSale": "true",
    "minPrice": "-1",
    "maxPrice": "-1",
    "writer": "",
    "minDiscount": "-1",
    "maxdiscount": "-1",
    "language": "",
}

all_data = []
for page in range(1, 3):  # <-- increase number of pages here
    print(f"Getting page {page}")
    data["page"] = page
    response = requests.post(url, data=data)
    soup = BeautifulSoup(response.content, "html.parser")
    # Each product card is flattened to "name|autor|price|type|publisher|..."
    for product in soup.select(".prd-content"):
        fields = product.get_text(strip=True, separator="|").split("|")
        all_data.append(fields[:5])

df = pd.DataFrame(
    all_data, columns=["name", "autor", "price", "type", "publisher"]
)
print(df)
df.to_csv("data.csv", index=False)
Prints:
name autor price type publisher
0 Esra Ezmeci Seti 5 Kitap Takım - Defter Hediyeli Esra Ezmeci 155,45 TL İnce Kapak Destek Yayınları
1 Şimdi Onlar Düşünsün Bircan Yıldırım 36,20 TL İnce Kapak Destek Yayınları
2 İz Bıraktığın Kadar Varsın Esra Ezmeci 36,20 TL İnce Kapak Destek Yayınları
...
and saves data.csv (screenshot from Libre Office):
I think you won't get a None — you will get:
AttributeError: ResultSet object has no attribute 'get'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
find_all() produces a ResultSet, so you have to iterate it to get all the href:
# find_all() returns a ResultSet, so iterate it and read each href.
for anchor in soup.find_all("a", {"class": "prd-name"}):
    href = anchor.get("href")
    print('https://www.dr.com.tr' + href)
Output
https://www.dr.com.tr/kitap/daha-adil-bir-dunya-mumkun/arastirma-tarih/politika-arastirma/turkiye-politika-/urunno=0001934858001
https://www.dr.com.tr/kitap/burasi-cok-onemli-enerjiden-ekonomiye-tam-bagimsiz-turkiye/arastirma-tarih/politika-arastirma/turkiye-politika-/urunno=0001966362001
https://www.dr.com.tr/kitap/iz-biraktigin-kadar-varsin/egitim-basvuru/psikoloji-bilimi/urunno=0001947472001
https://www.dr.com.tr/kitap/simdi-onlar-dusunsun/bircan-yildirim/egitim-basvuru/kisisel-gelisim/urunno=0001964436001
https://www.dr.com.tr/kitap/kadinlar-sicak-erkekler-soguk-sever/esra-ezmeci/egitim-basvuru/psikoloji-bilimi/urunno=0001904239001
https://www.dr.com.tr/kitap/dustugunde-kalkarsan-hayat-guzeldir/egitim-basvuru/psikoloji-bilimi/urunno=0001816754001
...

Can't get info of a lxml site with Request and BeautifulSoup

I'm trying to make a testing project that scraps info of a specific site but with no success.
I followed some tutorials i have found and even an post on stackoverflow. After all this I'm stuck!
help me stepbrothers, I'm a hot new programmer with python and I can't stop my projects.
more info: this is a lottery website that I was trying to scrape and analyse to get a lucky number.
I have followed this tutorials:
https://towardsdatascience.com/how-to-collect-data-from-any-website-cb8fad9e9ec5
https://beautiful-soup-4.readthedocs.io/en/latest/
Using BeautifulSoup in order to find all "ul" and "li" elements
All of you have my gratitute!
from bs4 import BeautifulSoup as bs
import requests
import html5lib
# import urllib3  # another attempt to make another request to the url ------failed

url = '''https://loterias.caixa.gov.br/Paginas/Mega-Sena.aspx'''


# another try to take results from the <ul>, but no qualified results == None
def parse_ul(elem):
    # https://stackoverflow.com/questions/50338108/using-beautifulsoup-in-order-to-find-all-ul-and-li-elements
    """Recursively map each direct <li> child of *elem* to its attributes."""
    collected = {}
    for item in elem.find_all('li', recursive=False):
        if item.li is None:
            continue
        attr_map = {k: v for k, v in item.attrs.items()}
        if item.ul is not None:
            # recurse down into the nested list
            attr_map['children'] = parse_ul(item.ul)
        collected[item.li.get_text(strip=True)] = attr_map
    return collected


page = requests.get(url)  # taking info from website
print(page.encoding)  # == UTF-8
soup = bs(page.content, features="lxml")  # parse the whole page with BeautifulSoup
numbers = soup.find(id='ulDezenas')  # search for this specific id // another try: soup.find('ul', {'class': ''})
result = parse_ul(soup)  # try to parse info, but none is found EVEN WITH THE ORIGINAL ONE
print(numbers)  # The result is below:
'''<ul class="numbers megasena" id="ulDezenas">
<li ng-repeat="dezena in resultado.listaDezenas ">{{dezena.length > 2 ? dezena.slice(1) : dezena}}</li>
</ul>'''
print(result)  # == "{}" nothing found
# with open('''D:\Documents\python\_abretesesame.txt''', 'wb') as fd:
#     for chunk in page.iter_content(chunk_size=128):
#         fd.write(chunk)
# =======printing document(HTML) in file still no success in getting the numbers
Main issue is that the content is provided dynamically by JavaScript but you can get the information via another url:
jsonData = requests.get('https://servicebus2.caixa.gov.br/portaldeloterias/api/megasena/').json()
will give you the following JSON:
{'tipoJogo': 'MEGA_SENA', 'numero': 2468, 'nomeMunicipioUFSorteio': 'SÃO PAULO, SP', 'dataApuracao': '02/04/2022', 'valorArrecadado': 158184963.0, 'valorEstimadoProximoConcurso': 3000000.0, 'valorAcumuladoProximoConcurso': 0.0, 'valorAcumuladoConcursoEspecial': 36771176.89, 'valorAcumuladoConcurso_0_5': 33463457.98, 'acumulado': False, 'indicadorConcursoEspecial': 1, 'dezenasSorteadasOrdemSorteio': ['022', '041', '053', '042', '035', '057'], 'listaResultadoEquipeEsportiva': None, 'numeroJogo': 2, 'nomeTimeCoracaoMesSorte': '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 'tipoPublicacao': 3, 'observacao': '', 'localSorteio': 'ESPAÇO DA SORTE', 'dataProximoConcurso': '06/04/2022', 'numeroConcursoAnterior': 2467, 'numeroConcursoProximo': 2469, 'valorTotalPremioFaixaUm': 0.0, 'numeroConcursoFinal_0_5': 2470, 'listaDezenas': ['022', '035', '041', '042', '053', '057'], 'listaDezenasSegundoSorteio': None, 'listaMunicipioUFGanhadores': [{'posicao': 1, 'ganhadores': 1, 'municipio': 'SANTOS', 'uf': 'SP', 'nomeFatansiaUL': '', 'serie': ''}], 'listaRateioPremio': [{'faixa': 1, 'numeroDeGanhadores': 1, 'valorPremio': 122627171.8, 'descricaoFaixa': '6 acertos'}, {'faixa': 2, 'numeroDeGanhadores': 267, 'valorPremio': 34158.18, 'descricaoFaixa': '5 acertos'}, {'faixa': 3, 'numeroDeGanhadores': 20734, 'valorPremio': 628.38, 'descricaoFaixa': '4 acertos'}], 'id': None, 'ultimoConcurso': True, 'exibirDetalhamentoPorCidade': True, 'premiacaoContingencia': None}
Simply extract listaDezenas and process it in a list comprehension:
[n if len(n) < 2 else n[1:] for n in jsonData['listaDezenas']]
Result will be:
['22', '35', '41', '42', '53', '57']
Example
import requests

# The draw data is served as JSON by the lottery API endpoint.
response = requests.get('https://servicebus2.caixa.gov.br/portaldeloterias/api/megasena/')
jsonData = response.json()
# Strip the leading zero from every drawn number longer than one digit.
print([n if len(n) < 2 else n[1:] for n in jsonData['listaDezenas']])

Convert Python list to Json object

I have three lists emojiLink, emojiTitle, emojiDescription in my code below.
from bs4 import BeautifulSoup
import pandas as pd
import requests  # BUG FIX: `requests` was used below but never imported

r = requests.get("https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(r.text, "lxml")

emojiLink = []
emojiTitle = []
emojiDescription = []

# Emoji image URLs live on <img> tags inside each emoji table row.
for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for img in tableRow.findChildren("img"):
        emojiLink.append(img['src'])

# Titles are the bold text inside the table cells.
for tableData in soup.find_all("td"):
    for boldTag in tableData.findChildren("b"):
        emojiTitle.append(boldTag.text)

# Descriptions are the last text line of each cell that carries an id.
for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for tabledata in tableRow.findChildren("td"):
        if tabledata.has_attr("id"):
            k = tabledata.text.strip().split('\n')[-1]
            l = k.lstrip()
            emojiDescription.append(l)
I want to convert these lists into a Json object which gonna look like...
{{"link": "emojiLink[0]", "title": "emojiTitle[0]", "desc": "emojiDescription[0]"},{"link": "emojiLink[1]", "title": "emojiTitle[1]", "desc": "emojiDescription[1]"}..........} so on...
I am not sure how to do this.
THANKS IN ADVANCE!!!
This returns an array of JSON objects based off of Chandella07's answer.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import json

r = requests.get("https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(r.text, "lxml")

emojiLinkList = []
emojiTitleList = []
emojiDescriptionList = []
jsonData = []

# Image URLs from each emoji table row.
for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for img in tableRow.findChildren("img"):
        emojiLinkList.append(img['src'])

# Titles from the bold tags inside table cells.
for tableData in soup.find_all("td"):
    for boldTag in tableData.findChildren("b"):
        emojiTitleList.append(boldTag.text)

# Descriptions: last text line of each cell that carries an id.
for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for tabledata in tableRow.findChildren("td"):
        if tabledata.has_attr("id"):
            k = tabledata.text.strip().split('\n')[-1]
            l = k.lstrip()
            emojiDescriptionList.append(l)

# Zip the three parallel lists into one record per emoji.
# BUG FIX: the record variable was named `dict`, shadowing the builtin type.
for link, title, desc in zip(emojiLinkList, emojiTitleList, emojiDescriptionList):
    record = {"link": link, "title": title, "desc": desc}
    jsonData.append(record)

print(json.dumps(jsonData, indent=2))
Data Example:
{
"link": "https://www.emojimeanings.net/img/emojis/purse_1f45b.png",
"title": "Wallet",
"desc": "After the shopping trip, the money has run out or the wallet was forgotten at home. The accessory keeps loose money but also credit cards or make-up. Can refer to shopping or money and stand for femininity and everything girlish."
},
One by one access each element from list and put it into some dict and at the end append to a list:
import json

# some example lists
em_link = ['a', 'b', 'c']
em_title = ['x', 'y', 'z']
em_desc = [1, 2, 3]

# Walk the three lists in lockstep and build one record per position.
arr = []
for link, title, desc in zip(em_link, em_title, em_desc):
    arr.append({"link": link, "title": title, "desc": desc})

print(json.dumps(arr))
Output:
[{"link": "a", "title": "x", "desc": 1}, {"link": "b", "title": "y", "desc": 2}, {"link": "c", "title": "z", "desc": 3}]
There is something wrong with your dict format. {{...},{...}} is not a valid format, [{...},{...}] is valid.
Regarding the merging logic:
for i in zip([1,2,3], ["a", "b"], [8,9,10]):
print(i)
... will output ...
(1, 'a', 8)
(2, 'b', 9)
Try something like that:
out = []
for i in zip(emojiLink, emojiTitle, emojiDescription):
out.append({"link": i[0], ...})
You can use the json library to read/write in json format.
import json

# Persist a sample dictionary to ./smthn.json in JSON format.
payload = {"a": "dictionary"}
with open('./smthn.json', 'w') as handle:
    handle.write(json.dumps(payload))
https://devtut.github.io/python/json-module.html#storing-data-in-a-file
So, you want a list of dictionary records? If you're sure all of the lists are the same length, you can do:
# Build one {"link", "title", "desc"} record per emoji; assumes all three
# source lists have the same length.
gather = []
for link_val, title_val, desc_val in zip(emojiLink, emojiTitle, emojiDescription):
    gather.append({"link": link_val, "title": title_val, "desc": desc_val})

json.dump(gather, open("myrecord.json", "w"))

Issues with Python BeautifulSoup parsing

I am trying to parse an html page with BeautifulSoup. The task is to get the data underlined with red color for all the lots on this page. I got the data from the left and the right block (about the lot, auction name, country etc) but getting the data from the central block seems to be problematic for me. Here is the example of what is done.
import requests
import re
from bs4 import BeautifulSoup as bs
import pandas as pd

URL_TEMPLATE = "https://www.artprice.com/artist/15079/wassily-kandinsky/lots/pasts?ipp=100"
FILE_NAME = "test"


def parse(url=URL_TEMPLATE):
    """Scrape one artprice listing page into a dict of parallel column lists."""
    result_list = {'lot': [], 'name': [], 'date': [], 'type1': [], 'type2': [], 'width': [], 'height': [], 'estimate': [], 'hummerprice': [], 'auction_date': [], 'auction': [], 'country': []}
    # BUG FIX: fetch the `url` parameter instead of always hitting URL_TEMPLATE,
    # which silently ignored any caller-supplied URL.
    r = requests.get(url)
    soup = bs(r.text, "html.parser")
    lot_info = soup.find_all('p', class_='hidden-xs')
    date_info = soup.find_all('date')
    names_info = soup.find_all('a', class_='sln_lot_show')
    auction_info = soup.find_all('p', class_='visible-xs')
    auction_date_info = soup.find_all(string=re.compile(r'\d\d\s\w\w\w\s\d\d\d\d'))[1::2]
    for tag in lot_info:
        result_list['lot'].append(tag.text)
    for tag in date_info:
        result_list['date'].append(tag.text)
    for tag in names_info:
        result_list['name'].append(tag.text)
    # PERF FIX: reuse the already-fetched auction_info instead of re-running
    # soup.find_all() on every loop iteration. Even rows hold the auction
    # house, odd rows the country.
    for i in range(0, len(auction_info), 2):
        result_list['auction'].append(auction_info[i].strong.string)
    for i in range(1, len(auction_info), 2):
        result_list['country'].append(auction_info[i].string)
    for text in auction_date_info:
        result_list['auction_date'].append(text)
    # NOTE(review): type1/type2/width/height/estimate/hummerprice stay empty;
    # pd.DataFrame will raise on unequal column lengths once real rows are
    # scraped — confirm intent (the central-block fields still need parsing).
    return result_list


df = pd.DataFrame(data=parse())
df.to_excel("test.xlsx")
So, the task is to get the data from the central block separately for each lot on this page.
You need nth-of-type to access all those <p> elements.
This does it for just the first one to show that it works.
I'll leave it to you to clean up the output.
# Print the title plus the 2nd-4th <p> of just the first lot card, to show
# that nth-of-type reaches the central-block paragraphs.
for div in soup.find_all('div', class_='col-xs-8 col-sm-6'):
    print(div.select_one('a').text.strip())
    for position in (2, 3, 4):
        paragraph = div.select_one(f'p:nth-of-type({position})')
        print(paragraph.text.strip())
    break
Result:
Abstract
Print-Multiple, Print in colors, 29 1/2 x 31 1/2 in75 x 80 cm
Estimate:
€ 560 - € 784
$ 605 - $ 848
£ 500 - £ 700
¥ 4,303 - ¥ 6,025
Hammer price:
not communicated
not communicated
not communicated
not communicated

Using regex to search for text that follows a specific word

I am searching a string of text which contains dictionaries that look like so:
# Raw text captured from the page: two JSON-style slot dictionaries
# separated by a comma (not yet a valid JSON document on its own).
soup_string = """{"loadType":"","shiftId":"ROVR-DUMMY-SHIFTID","carbonFriendly":"no","cost":"£2.00","initialSlotPrice":"","timeSlotISO":"2019-06-13T12:00+01:00/13:00+01:00","isSameDayPremium":"false","stopId":"10446315588190612134701380","availability":"full","slotDiscountedByDP":"false","slotId":"1hr-12-13-20190613","time":"12:00pm - 1:00pm","rawSlotPrice":"","slotDiscounted":"false"},
{"loadType":"","shiftId":"ROVR-DUMMY-SHIFTID","carbonFriendly":"no","cost":"£2.00","initialSlotPrice":"","timeSlotISO":"2019-06-13T12:30+01:00/13:30+01:00","isSameDayPremium":"false","stopId":"10446315588190612134701380","availability":"available","slotDiscountedByDP":"false","slotId":"1hr-12:30-13:30-20190613","time":"12:30pm - 1:30pm","rawSlotPrice":"","slotDiscounted":"false"}"""
I am looking to return the string which follows each key in the 'dictionaries'.
I have decided an appropriate method is to use Regex expressions. I can return each times and costs using
# Pull every "£x.xx" price and every "time":"..." value out of the raw text.
price_pattern = re.compile(r"\£[0-9]\.[0-9][0-9]")
time_pattern = re.compile(r'\"(time)\"\:\"(.{14,16})\"\,')
Costs = price_pattern.findall(soup_string)
times = time_pattern.findall(soup_string)
Essentially I would like to be able to look for each key in the dictionary, and search for a specific string then return the value.
The end goal is to create a dictionary with the 'Cost', 'Availability' and 'time'.
Full code:
import requests
from bs4 import BeautifulSoup
import json

postcode = "L4 0TH"
ASDA_url = "https://groceries.asda.com/api/user/checkpostcode?postcode=" + postcode + "&requestorigin=gi"
ASDA_url2 = "https://groceries.asda.com/api/slot/view?startdate=12%2F06%2F2019&deliveryoption=homedelivery&requestorigin=gi&_="

# A session keeps cookies from the postcode check for the slot request.
client = requests.Session()
r = client.get(ASDA_url)
r2 = client.get(ASDA_url2)

soup = BeautifulSoup(r2.text, 'html.parser')
soup_string = str(soup)

# Wrap the comma-separated slot dictionaries in brackets so they parse as a JSON array.
soup_dicts = json.loads('[' + soup_string + ']')

keep_keys = ('cost', 'availability', 'time')
filtered = [{k: soup_dict[k] for k in keep_keys} for soup_dict in soup_dicts]
# BUG FIX: removed the stray Markdown code fence (```) that trailed the last line.
Given that you have multiple dictionaries, I'm not exactly sure what you're trying to obtain, but from my understanding this should help:
import json
soup_string = ''' ... ''' # As it is in the question
soup_dicts = json.loads('[' + soup_string + ']')
keep_keys = ('cost', 'availability', 'time')
filtered = [{k:soup_dict[k] for k in keep_keys} for soup_dict in soup_dicts]
It treats your string of dictionaries as a list of JSON dictionaries, and uses the json module to parse it. Then it filters out everything except the key/value pairs you need. The result is a list of the filtered dictionaries.
Output (i.e. value of filtered):
[
{'cost': '£2.00', 'availability': 'full', 'time': '12:00pm - 1:00pm'},
{'cost': '£2.00', 'availability': 'available', 'time': '12:30pm - 1:30pm'}
]
EDIT:
In response to you providing your code, I can see that you're calling str on the results from BeautifulSoup. Rather than doing that, you can just process the client.get() results directly:
import json
import requests

postcode = "L4 0TH"
ASDA_url = "https://groceries.asda.com/api/user/checkpostcode?postcode=" + postcode + "&requestorigin=gi"
ASDA_url2 = "https://groceries.asda.com/api/slot/view?startdate=12%2F06%2F2019&deliveryoption=homedelivery&requestorigin=gi&_="

client = requests.Session()
r = client.get(ASDA_url)
r2 = client.get(ASDA_url2)

# The slot endpoint already returns JSON — take the slot list straight from it,
# no BeautifulSoup/str round-trip needed.
dicts = r2.json()['slotHeader'][0]['slots']

keep_keys = ('cost', 'availability', 'time')
filtered = [{key: slot[key] for key in keep_keys} for slot in dicts]
First you need to put your data into a list and create a dictionary with the key "data" (see my example below). Then use json to parse it into a dictionary of dictionaries. Finally, extract cost, availability and time from each dictionary in a loop.
import json

# Sample payload: the two slot dictionaries wrapped under a "data" key so the
# whole thing is one valid JSON document.
soup_string = """{"data": [{"loadType":"","shiftId":"ROVR-DUMMY-SHIFTID","carbonFriendly":"no","cost":"£2.00","initialSlotPrice":"","timeSlotISO":"2019-06-13T12:00+01:00/13:00+01:00","isSameDayPremium":"false","stopId":"10446315588190612134701380","availability":"full","slotDiscountedByDP":"false","slotId":"1hr-12-13-20190613","time":"12:00pm - 1:00pm","rawSlotPrice":"","slotDiscounted":"false"}, {"loadType":"","shiftId":"ROVR-DUMMY-SHIFTID","carbonFriendly":"no","cost":"£2.00","initialSlotPrice":"","timeSlotISO":"2019-06-13T12:30+01:00/13:30+01:00","isSameDayPremium":"false","stopId":"10446315588190612134701380","availability":"available","slotDiscountedByDP":"false","slotId":"1hr-12:30-13:30-20190613","time":"12:30pm - 1:30pm","rawSlotPrice":"","slotDiscounted":"false"}]}"""

d = json.loads(soup_string)

# Keep only the Cost/Availability/Time of each slot.
# BUG FIX: removed the dead `cost, avail, time = [], [], []` line — the lists
# were never used and `time` shadowed the stdlib module name.
result = []
for data in d['data']:
    result.append({
        'Cost': data['cost'],
        'Availability': data['availability'],
        'Time': data['time'],
    })
result
Output:
[{'Cost': '£2.00', 'Availability': 'full', 'Time': '12:00pm - 1:00pm'},
{'Cost': '£2.00', 'Availability': 'available', 'Time': '12:30pm - 1:30pm'}]

Categories

Resources