Convert Python lists to a JSON object - python

I have three lists, emojiLink, emojiTitle, and emojiDescription, in my code below.
from bs4 import BeautifulSoup
import pandas as pd
import requests

r = requests.get("https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(r.text, "lxml")

emojiLink = []
emojiTitle = []
emojiDescription = []

for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for img in tableRow.findChildren("img"):
        emojiLink.append(img['src'])

for tableData in soup.find_all("td"):
    for boldTag in tableData.findChildren("b"):
        emojiTitle.append(boldTag.text)

for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for tabledata in tableRow.findChildren("td"):
        if tabledata.has_attr("id"):
            k = tabledata.text.strip().split('\n')[-1]
            l = k.lstrip()
            emojiDescription.append(l)
I want to convert these lists into a JSON object that is going to look like this:
{{"link": "emojiLink[0]", "title": "emojiTitle[0]", "desc": "emojiDescription[0]"},{"link": "emojiLink[1]", "title": "emojiTitle[1]", "desc": "emojiDescription[1]"}..........} so on...
I can't figure out how to do this.
THANKS IN ADVANCE!!!

This returns an array of JSON objects, based on Chandella07's answer.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import json

r = requests.get("https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(r.text, "lxml")

emojiLinkList = []
emojiTitleList = []
emojiDescriptionList = []
jsonData = []

for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for img in tableRow.findChildren("img"):
        emojiLinkList.append(img['src'])

for tableData in soup.find_all("td"):
    for boldTag in tableData.findChildren("b"):
        emojiTitleList.append(boldTag.text)

for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for tabledata in tableRow.findChildren("td"):
        if tabledata.has_attr("id"):
            k = tabledata.text.strip().split('\n')[-1]
            l = k.lstrip()
            emojiDescriptionList.append(l)

for link, title, desc in zip(emojiLinkList, emojiTitleList, emojiDescriptionList):
    record = {"link": link, "title": title, "desc": desc}
    jsonData.append(record)

print(json.dumps(jsonData, indent=2))
Data Example:
{
  "link": "https://www.emojimeanings.net/img/emojis/purse_1f45b.png",
  "title": "Wallet",
  "desc": "After the shopping trip, the money has run out or the wallet was forgotten at home. The accessory keeps loose money but also credit cards or make-up. Can refer to shopping or money and stand for femininity and everything girlish."
},

Access the elements of the lists one by one, put each triple into a dict, and append that dict to a list:
import json

# some example lists
em_link = ['a', 'b', 'c']
em_title = ['x', 'y', 'z']
em_desc = [1, 2, 3]

arr = []
for i, j, k in zip(em_link, em_title, em_desc):
    d = {}
    d.update({"link": i})
    d.update({"title": j})
    d.update({"desc": k})
    arr.append(d)

print(json.dumps(arr))
Output:
[{"link": "a", "title": "x", "desc": 1}, {"link": "b", "title": "y", "desc": 2}, {"link": "c", "title": "z", "desc": 3}]

There is something wrong with your dict format. {{...},{...}} is not a valid format, [{...},{...}] is valid.
Regarding the merging logic:
for i in zip([1,2,3], ["a", "b"], [8,9,10]):
    print(i)
... will output ...
(1, 'a', 8)
(2, 'b', 9)
Try something like this:
out = []
for i in zip(emojiLink, emojiTitle, emojiDescription):
    out.append({"link": i[0], "title": i[1], "desc": i[2]})
You can use the json library to read/write in json format.
import json

with open('./smthn.json', 'w') as f:
    json.dump({"a": "dictionary"}, f)
https://devtut.github.io/python/json-module.html#storing-data-in-a-file
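For completeness, reading the data back later works with json.load (a minimal sketch, assuming the smthn.json file written above exists):

import json

with open('./smthn.json') as f:
    data = json.load(f)
print(data)  # {'a': 'dictionary'}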

So, you want a list of dictionary records? If you're sure all of the lists are the same length, you can do:
gather = []
for l, t, d in zip(emojiLink, emojiTitle, emojiDescription):
    gather.append({"link": l, "title": t, "desc": d})

json.dump(gather, open("myrecord.json", "w"))

Related

Create nested lists from another list based on a byte-size check, with dynamically incremented nested list names

I have a list filled with paragraph tag (<p></p>) content from a site, scraped using beautifulsoup4.
I would like to break that list into nested lists whose names are dynamically incremented, with the incrementation based on a byte-size check of the current nested list. The result should then be used to create a JSON object.
My current code for example:
import requests, sys, json
from bs4 import BeautifulSoup

def getContent():
    page = requests.get("www.example.com")
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.prettify()

    data = {}
    SECTION_INDEX = 1
    data_container = []
    article_section_size = 0
    article_section_data = []

    for tag in soup.find_all("p"):
        text = tag.text
        data_container.append(text)

    for p in data_container:
        article_section = "CONTENT_SECTION_" + str(SECTION_INDEX)
        article_section_data.append(p)
        data[article_section] = article_section_data
        article_section_size += sys.getsizeof(p)
        if article_section_size >= 300:
            SECTION_INDEX = SECTION_INDEX + 1
    return(data)

def createJson():
    data = getContent()
    json_source = {
        "ARTICLE_DATA": data
    }
    json_object = json.dumps(json_source, indent=2)

def main():
    createJson()
The actual result:
{
  "CONTENT_DATA": {
    "CONTENT_SECTION_1": [
      "the actual paragraphs",
      "content goes there",
      "some more content",
      "even more content from the site",
      "and some even more",
      "and finally, some more"
    ],
    "CONTENT_SECTION_2": [
      "the actual paragraphs",
      "content goes there",
      "some more content",
      "even more content from the site",
      "and some even more",
      "and finally, some more"
    ],
    "CONTENT_SECTION_3": [
      "the actual paragraphs",
      "content goes there",
      "some more content",
      "even more content from the site",
      "and some even more",
      "and finally, some more"
    ]
  }
}
The desired result:
{
  "CONTENT_DATA": {
    "CONTENT_SECTION_1": [
      "the actual paragraphs",
      "content goes there"
    ],
    "CONTENT_SECTION_2": [
      "some more content",
      "even more content from the site"
    ],
    "CONTENT_SECTION_3": [
      "and some even more",
      "and finally, some more"
    ]
  }
}
How can I achieve this, and why does the pattern repeat in the actual result above?
For anyone interested in the solution, the code to achieve the desired result is the following:
import requests, sys
from bs4 import BeautifulSoup

def getContent():
    page = requests.get("www.example.com")
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.prettify()

    data = {}
    SECTION_INDEX = 1
    data_container = []
    article_section_size = 0
    article_section_data = []

    for tag in soup.find_all("p"):
        text = tag.text
        data_container.append(text)

    for p in data_container:
        article_section = "CONTENT_SECTION_" + str(SECTION_INDEX)
        article_section_data.append(p)
        article_section_size += sys.getsizeof(p)
        if article_section_size >= 300:
            data[article_section] = article_section_data
            article_section_data = []
            article_section_size = 0
            SECTION_INDEX = SECTION_INDEX + 1

    if article_section_data:
        data[article_section] = article_section_data

    return(data)
The repeated pattern from the actual result was due to the fact that I was always appending the p element to the article_section_data list, rather than resetting it to an empty list when the desired byte size has been reached.
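As a variation on the same idea (not part of the original question or answer), the grouping can also be factored into a small generator that yields one chunk whenever the accumulated byte size reaches the threshold; the 300-byte limit and section naming follow the question, everything else is illustrative:

import sys

def chunk_by_size(paragraphs, max_bytes=300):
    # Yield lists of paragraphs; start a new list once the accumulated
    # sys.getsizeof total reaches max_bytes.
    chunk, size = [], 0
    for p in paragraphs:
        chunk.append(p)
        size += sys.getsizeof(p)
        if size >= max_bytes:
            yield chunk
            chunk, size = [], 0
    if chunk:
        yield chunk

paragraphs = ["the actual paragraphs", "content goes there", "some more content"]
data = {
    "CONTENT_SECTION_" + str(i): section
    for i, section in enumerate(chunk_by_size(paragraphs), start=1)
}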

Creating multiple dataframes using a loop or function

I'm trying to extract the hash rate for 3 cryptocurrencies, and I have attached the code for it below. Now I want to pass three URLs and get back three different dictionaries holding the values. I'm stuck and I don't understand how I should go about it. I have tried using loops, but it is not working out for me.
url = {'Bitcoin' : 'https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y',
       'Ethereum': 'https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y',
       'Litecoin': 'https://bitinfocharts.com/comparison/litecoin-hashrate.html'}

for ele in url:
    #### requesting the page and extracting the script which has date and values
    session = requests.Session()
    page = session.get(ele[i])
    soup = BeautifulSoup(page.content, 'html.parser')
    values = str(soup.find_all('script')[4])
    values = values.split('d = new Dygraph(document.getElementById("container"),')[1]

    # create an empty dict to append date and hashrates
    dict([("crypto_1 %s" % i, []) for i in range(len(url))])

    # run a loop over all the dates and adding to dictionary
    for i in range(values.count('new Date')):
        date = values.split('new Date("')[i+1].split('"')[0]
        value = values.split('"),')[i+1].split(']')[0]
        dict([("crypto_1 %s" % i)[date] = value
You can use the next example to get data from all 3 URLs and create a dataframe/dictionary from it:
import re
import requests
import pandas as pd

url = {
    "Bitcoin": "https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y",
    "Ethereum": "https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y",
    "Litecoin": "https://bitinfocharts.com/comparison/litecoin-hashrate.html",
}

data = []
for name, u in url.items():
    html_doc = requests.get(u).text
    for date, hash_rate in re.findall(
        r'\[new Date\("(.*?)"\),(.*?)\]', html_doc
    ):
        data.append(
            {
                "Name": name,
                "Date": date,
                "Hash Rate": float("nan")
                if hash_rate == "null"
                else float(hash_rate),
            }
        )

df = pd.DataFrame(data)
df["Date"] = pd.to_datetime(df["Date"])

# here save df to CSV

# this will create a dictionary, where the keys are crypto names and values
# are dicts with keys Date/HashRate:
out = {}
for name, g in df.groupby("Name"):
    out[name] = g[["Date", "Hash Rate"]].to_dict(orient="list")

print(out)
Prints:
{
    "Bitcoin": {
        "Date": [
            Timestamp("2009-01-03 00:00:00"),
            Timestamp("2009-01-04 00:00:00"),
            Timestamp("2009-01-05 00:00:00"),
            ...
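Regarding the "# here save df to CSV" comment in the answer above, writing the combined frame (or one file per coin) could look like this; the file names are illustrative, not part of the original answer:

df.to_csv("hashrates.csv", index=False)

# or one CSV per cryptocurrency
for name, g in df.groupby("Name"):
    g.to_csv(name.lower() + "_hashrate.csv", index=False)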

Cannot get the "href" attributes via BeautifulSoup

In short, I can't get the links from the "href" attribute on this site (a Turkish online seller of books and related items).
Here's my code (I know it's not the best; I've been learning Python online for a few months, so any heads-up on best practices is also welcome).
I tried to get the book names, writers, prices, publishers, and the links for each book; without the links it works as I expected.
import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint

yazar = []
fiyat = []
yayın = []
isim = []

for i in range(1, 10):
    url = "https://www.dr.com.tr/CokSatanlar/Kitap#/page=" + str(i)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "lxml")

    # book names
    k = soup.find_all("a", {"class": "prd-name"})
    for i in k:
        isim.append(i.text)

    # writer names
    y = soup.find_all("a", {"class": "who text-overflow"})
    for i in y:
        yazar.append(i.text)

    # prices
    f = soup.find_all("div", {"class": "prd-price"})
    for i in f:
        fiyat.append(i.text.split()[0])

    # publishers
    ye = soup.find_all("a", {"class": "prd-publisher"})
    for i in ye:
        yayın.append(i.get("title"))

    sleep(randint(2, 4))
However, when I try to get the links with
soup.find_all("a", {"class":"prd-name"}).get("href")
it returns None, and I couldn't manage to make this work no matter what I tried.
Thank you all in advance, and sorry for a slightly longer than usual post.
The data you see on the page is loaded from an external location, so you need a different URL to get the correct data:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://www.dr.com.tr/Catalog/CatalogProducts"

data = {
    "catalogId": "4020",
    "page": "1",
    "sortfield": "soldcount",
    "sortorder": "desc",
    "size": "60",
    "categoryid": "0",
    "parentId": "0",
    "mediatypes": "",
    "HideNotForSale": "true",
    "minPrice": "-1",
    "maxPrice": "-1",
    "writer": "",
    "minDiscount": "-1",
    "maxdiscount": "-1",
    "language": "",
}

all_data = []
for page in range(1, 3):  # <-- increase number of pages here
    print(f"Getting page {page}")
    data["page"] = page

    soup = BeautifulSoup(requests.post(url, data=data).content, "html.parser")

    for p in soup.select(".prd-content"):
        all_data.append(p.get_text(strip=True, separator="|").split("|")[:5])

df = pd.DataFrame(
    all_data, columns=["name", "autor", "price", "type", "publisher"]
)
print(df)
df.to_csv("data.csv", index=False)
Prints:
name autor price type publisher
0 Esra Ezmeci Seti 5 Kitap Takım - Defter Hediyeli Esra Ezmeci 155,45 TL İnce Kapak Destek Yayınları
1 Şimdi Onlar Düşünsün Bircan Yıldırım 36,20 TL İnce Kapak Destek Yayınları
2 İz Bıraktığın Kadar Varsın Esra Ezmeci 36,20 TL İnce Kapak Destek Yayınları
...
and saves data.csv (screenshot of the CSV opened in LibreOffice not included here).
I think you won't get None; you will get:
AttributeError: ResultSet object has no attribute 'get'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
find_all() produces a ResultSet, so you have to iterate it to get all the href:
for a in soup.find_all("a", {"class": "prd-name"}):
    print('https://www.dr.com.tr' + a.get("href"))
Output
https://www.dr.com.tr/kitap/daha-adil-bir-dunya-mumkun/arastirma-tarih/politika-arastirma/turkiye-politika-/urunno=0001934858001
https://www.dr.com.tr/kitap/burasi-cok-onemli-enerjiden-ekonomiye-tam-bagimsiz-turkiye/arastirma-tarih/politika-arastirma/turkiye-politika-/urunno=0001966362001
https://www.dr.com.tr/kitap/iz-biraktigin-kadar-varsin/egitim-basvuru/psikoloji-bilimi/urunno=0001947472001
https://www.dr.com.tr/kitap/simdi-onlar-dusunsun/bircan-yildirim/egitim-basvuru/kisisel-gelisim/urunno=0001964436001
https://www.dr.com.tr/kitap/kadinlar-sicak-erkekler-soguk-sever/esra-ezmeci/egitim-basvuru/psikoloji-bilimi/urunno=0001904239001
https://www.dr.com.tr/kitap/dustugunde-kalkarsan-hayat-guzeldir/egitim-basvuru/psikoloji-bilimi/urunno=0001816754001
...
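If the anchors are actually present in the fetched HTML, the links can be collected alongside the book names by extending the "# book names" part of the loop in the question; linkler is a hypothetical list name added here for illustration:

linkler = []
for a in soup.find_all("a", {"class": "prd-name"}):
    isim.append(a.text)
    linkler.append("https://www.dr.com.tr" + a.get("href"))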

Find coordinates in wikipedia pages iterating over a list

Probably this is a simple question, but my experience with for loops is very limited.
I was trying to adapt the solution on this page https://www.mediawiki.org/wiki/API:Geosearch to some simple examples that I have, but the result is not what I expected.
For example:
I have this simple data frame:
df= pd.DataFrame({'City':['Sesimbra','Ciudad Juárez','31100 Treviso','Ramada Portugal','Olhão'],
'Country':['Portugal','México','Itália','Portugal','Portugal']})
I created a list based on cities:
lista_cidades = list(df['City'])
and I would like to iterate over this list to get the coordinates (decimal, preferably).
So far I tried this approach:
import requests

lng_dict = {}
lat_dict = {}

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"

PARAMS = {
    "action": "query",
    "format": "json",
    "titles": [lista_cidades],
    "prop": "coordinates"
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()
PAGES = DATA['query']['pages']

for i in range(len(lista_cidades)):
    for k, v in PAGES.items():
        try:
            lat_dict[lista_cidades[i]] = str(v['coordinates'][0]['lat'])
            lng_dict[lista_cidades[i]] = str(v['coordinates'][0]['lon'])
        except:
            pass
but it looks like the code doesn't iterate over the list and always returns the same coordinate.
For example, when I call the dictionary with longitude coordinates, this is what I get:
lng_dict
{'Sesimbra': '-7.84166667',
'Ciudad Juárez': '-7.84166667',
'31100 Treviso': '-7.84166667',
'Ramada Portugal': '-7.84166667',
'Olhão': '-7.84166667'}
What should i do to solve this?
Thanks in advance
I think the query returns only one result; it will take only the last city from your list (in your case the "Olhão" coordinates).
You can check it by logging the DATA content.
I do not know the Wikipedia API well, but either your call lacks a parameter (the documentation should give you that information) or you have to call the API for each city, like:
import pandas as pd
import requests

df = pd.DataFrame({'City': ['Sesimbra', 'Ciudad Juárez', '31100 Treviso', 'Ramada Portugal', 'Olhão'],
                   'Country': ['Portugal', 'México', 'Itália', 'Portugal', 'Portugal']})

lista_cidades = list(df['City'])

lng_dict = {}
lat_dict = {}

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"

for city in lista_cidades:
    PARAMS = {
        "action": "query",
        "format": "json",
        "titles": city,
        "prop": "coordinates"
    }

    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()
    PAGES = DATA['query']['pages']

    for k, v in PAGES.items():
        try:
            lat_dict[city] = str(v['coordinates'][0]['lat'])
            lng_dict[city] = str(v['coordinates'][0]['lon'])
        except:
            pass
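As a small follow-up sketch (not part of the original answer), the collected coordinates can be attached back onto the dataframe with Series.map, which leaves NaN for any city that returned no coordinates:

df['lat'] = df['City'].map(lat_dict)
df['lon'] = df['City'].map(lng_dict)
print(df)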

Convert downloaded string in pattern like [{" t": "1 ", "id": "NOW.976818" .... "cv": "1"}] into pd dataframe?

I downloaded a list of news content into a pandas dataframe. Instead of putting the info into a table, pandas put everything into a single cell. Upon inspection, the downloaded string is in this pattern:
"['[{"t": "1", "id": "NOW.976818", "dt": "2019/11/15 10:13", "h": "《美股業績》Nvidia季績勝預期 季度收入預測遜預期", "u": "",...
How do I convert this into a pandas table?
My code:
urlpull = "http://www.aastocks.com/tc/resources/datafeed/getmorenews.ashx?cat=result-announcement&newstime=942660890&newsid=NOW.976800&period=0&key="

df = pd.DataFrame({'News': ['a'], 'Page': ['1']})

result = requests.get(urlpull)
result.raise_for_status()
result.encoding = "utf-8"
src = result.content
soup = BeautifulSoup(src, 'lxml')

news = []
for a_tag in soup.find_all('p'):
    news.append(a_tag.text)

df = df.append(pd.DataFrame(news, columns=['News']))
print(news)

df['num'] = df['News'].str.extract('(\d{5})')
df["stock_num"] = pd.to_numeric(df["num"], errors="coerce").fillna(0).astype("int64")
print(df)

df.to_excel("News.xlsx")
You can do it directly with:
pd.read_table(filename/url)
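If the endpoint really returns a JSON array of records, as the pattern quoted in the question suggests, another option is to parse the body with the json module and build the frame from the records; this is a sketch under that assumption, not tested against the live feed:

import json
import requests
import pandas as pd

urlpull = "http://www.aastocks.com/tc/resources/datafeed/getmorenews.ashx?cat=result-announcement&newstime=942660890&newsid=NOW.976800&period=0&key="
result = requests.get(urlpull)
result.raise_for_status()
result.encoding = "utf-8"

# Assumption: the body is a JSON array of records such as {"t": ..., "id": ..., "dt": ..., "h": ...}.
records = json.loads(result.text)
df = pd.DataFrame(records)
print(df.head())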
