I'm writing a function that filters tweet data down to the rows that contain a search word.
Here's my code:
# Collects the rows of df whose cleaned_tweet contains search and builds a new DataFrame from them.
# NOTE(review): this is the snippet that shows the reported problem; the code itself is left unchanged.
def twitter_filter(df, search):
coun = 0  # never read again in this function
# One list per output column, filled row by row below.
date_ls = []
id_ls = []
content_ls = []
lan_ls = []
name_ls = []
retweet_ls = []
cleaned_tweet_ls = []
for i, row in df.iterrows():
# Plain substring test on the row's cleaned_tweet value.
if search in row.cleaned_tweet:
date_ls.append(row.date)
id_ls.append(row.id)
content_ls.append(row.content)
lan_ls.append(row.language)
# NOTE(review): row is a pandas Series here, and a Series' .name attribute is the row's index label,
# so this appends the index label instead of the value of the "name" column; row["name"] reads the column.
name_ls.append(row.name)
retweet_ls.append(row.retweet)
cleaned_tweet_ls.append(row.cleaned_tweet)
new_dict = {
"date": date_ls,
"id": id_ls,
"content": content_ls,
"lan" : lan_ls,
"name" : name_ls,
"retweet" : retweet_ls,
# The output key is spelled "cleaned_tweeet" (three e's).
"cleaned_tweeet": cleaned_tweet_ls,
}
new_df = pd.DataFrame(new_dict)
return new_df
Before filter:
cleandf['name']
Out[6]:
0 PryZmRuleZZ
1 Arbitration111
2 4kjweed
3 THEREALCAMOJOE
5 DailyBSC_
130997 Rabbitdogebsc
130999 gmtowner
131000 topcryptostats
131001 vGhostvRiderv
131002 gmtowner
Name: name, Length: 98177, dtype: object
After filtering, the user names have become seemingly random integers:
cleanedogetweet['name']
Out[7]:
0 3
1 5
2 9
3 12
4 34
80779 130997
80780 130999
80781 131000
80782 131001
80783 131002
Name: name, Length: 80784, dtype: int64
This problem only happens in the user name column; the other columns that contain strings are fine.
I expected the original user names to be kept. How can I solve this problem?
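df.iterrows() yields each row as a pandas Series, and every Series has an attribute called name.
For a row, that attribute holds the name of the row, which by default is the row's index. So row.name returns the index of the row rather than the value in your name column, which is why the integers you see (e.g. 130997, 130999, ...) are the original index values.
So it's better that your column is not called name, because attribute access will conflict with the name attribute of the row.
You can use the rename method to give the column another name such as username, or you can keep the column and read it with row['name'] instead, by changing your function to this: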
def twitter_filter(df, search):
    """Return the tweets whose cleaned text contains ``search``.

    Args:
        df: DataFrame with the columns ``date``, ``id``, ``content``,
            ``language``, ``name``, ``retweet`` and ``cleaned_tweet``.
        search: Substring to look for in each ``cleaned_tweet`` value.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding only the matching
        rows, in their original order, under the columns ``date``, ``id``,
        ``content``, ``lan``, ``user_name``, ``retweet`` and
        ``cleaned_tweeet``.
    """
    # Output column -> source column. The "cleaned_tweeet" spelling is kept
    # as-is so code that already reads that column keeps working.
    column_map = {
        "date": "date",
        "id": "id",
        "content": "content",
        "lan": "language",
        "user_name": "name",
        "retweet": "retweet",
        "cleaned_tweeet": "cleaned_tweet",
    }

    # Plain substring test per cell, evaluated once for the whole column.
    mask = [search in text for text in df["cleaned_tweet"]]
    matched = df.loc[mask]

    # Columns are read with [] and never as attributes: on a row Series,
    # ``.name`` is the row's index label, not the "name" column.
    new_dict = {out: matched[src].tolist() for out, src in column_map.items()}
    return pd.DataFrame(new_dict)
Related
I am getting following string format from csv file in Pandas
"title = matrix, genre = action, year = 2000, rate = 8"
How can I change the string value into a python dictionary like this:
# Input: one string as read from the CSV file.
movie = "title = matrix, genre = action, year = 2000, rate = 8"
# Desired result: every "key = value" pair becomes one dictionary entry,
# with the values kept as strings.
movie = {
    "title": "matrix",
    "genre": "action",
    "year": "2000",
    "rate": "8",
}
You can split the string and then convert it into a dictionary.
A sample code is given below
movie = "title = matrix, genre = action, year = 2000, rate = 8"
# print(movie)
# Cut the string into "key = value" chunks at the commas, then split each
# chunk on its first "=" only, so a value that itself contains "=" stays whole.
tempMovie = [chunk.split("=", 1) for chunk in movie.split(",")]
# Strip the blanks that surrounded "=" and "," in the input.
movie = {key.strip(): value.strip() for key, value in tempMovie}
print(movie)
For the solution you can use regex
import re
input_user = "title = matrix, genre = action, year = 2000, rate = 8"
# Create a pattern to match the key-value pairs. The value is everything up
# to (but not including) the next comma, so the separator is never captured.
pattern = re.compile(r"(\w+) = ([^,]+)")
# Find all matches in the input string; each match is a (key, value) tuple
matches = pattern.findall(input_user)
# Convert the matches to a dictionary
result = dict(matches)
print(result)
# The result:
# {'title': 'matrix', 'genre': 'action', 'year': '2000', 'rate': '8'}
I hope this can solve your problem.
# Format of one cell in the movie column.
movie = "title = matrix, genre = action, year = 2000, rate = 8"
dict_all_movies = {}
# `df` and `str_movie_column` (the name of the column that holds such
# strings) are expected to be defined by the surrounding code.
for idx, str_movie in df[str_movie_column].items():
    # "a = 1, b = 2" -> {"a": "1", "b": "2"}; split on the first " = " only.
    movie_dict = dict(item.split(" = ", 1) for item in str_movie.split(", "))
    # One parsed dict per row, keyed by the row's index label as a string.
    dict_all_movies[str(idx)] = movie_dict
I'm trying to extract the hash rate for 3 cryptocurrencies, and I have attached my code below. I want to pass in three URLs and get back three different dictionaries that hold the values. I'm stuck and don't understand how I should go about it. I have tried using loops, but it is not working for me.
# NOTE(review): this is the asker's non-working attempt; the code is left unchanged and only annotated.
url = {'Bitcoin' : 'https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y',
'Ethereum': 'https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y',
'Litecoin': 'https://bitinfocharts.com/comparison/litecoin-hashrate.html'}
# NOTE(review): iterating a dict yields its keys, so ele is the coin name (e.g. 'Bitcoin'), not the URL.
for ele in url:
#### requesting the page and extracting the script which has date and values
session = requests.Session()
# NOTE(review): i has not been assigned yet on the first pass, and ele[i] would index into the key string; url[ele] is the URL.
page = session.get(ele[i])
soup = BeautifulSoup(page.content, 'html.parser')
values = str(soup.find_all('script')[4])
values = values.split('d = new Dygraph(document.getElementById("container"),')[1]
#create an empty dict to append date and hashrates
# NOTE(review): this builds a dict with keys "crypto_1 0", "crypto_1 1", ... but the result is not assigned to any name.
dict([("crypto_1 %s" % i,[]) for i in range(len(url))])
#run a loop over all the dates and adding to dictionary
for i in range(values.count('new Date')):
date = values.split('new Date("')[i+1].split('"')[0]
value = values.split('"),')[i+1].split(']')[0]
# NOTE(review): not valid Python: the dict([ call is never closed and an assignment cannot appear inside it.
dict([("crypto_1 %s" % i)[date] = value
You can use the next example to get the data from all 3 URLs and create a dataframe/dictionary from it:
import re
import requests
import pandas as pd
url = {
    "Bitcoin": "https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y",
    "Ethereum": "https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y",
    "Litecoin": "https://bitinfocharts.com/comparison/litecoin-hashrate.html",
}

# Seconds to wait for each page; without a timeout a stalled server would
# block the script indefinitely.
REQUEST_TIMEOUT = 30

# Matches one data point of the form [new Date("DATE"),VALUE] and captures
# DATE and VALUE. Compiled once because it is reused for every page.
POINT_RE = re.compile(r'\[new Date\("(.*?)"\),(.*?)\]')

data = []
for name, u in url.items():
    response = requests.get(u, timeout=REQUEST_TIMEOUT)
    # Stop on an HTTP error instead of silently parsing an error page.
    response.raise_for_status()
    html_doc = response.text

    for date, hash_rate in POINT_RE.findall(html_doc):
        data.append(
            {
                "Name": name,
                "Date": date,
                # The page writes missing values as the literal text "null".
                "Hash Rate": float("nan")
                if hash_rate == "null"
                else float(hash_rate),
            }
        )

# Naming the columns keeps the next line working even if nothing matched.
df = pd.DataFrame(data, columns=["Name", "Date", "Hash Rate"])
df["Date"] = pd.to_datetime(df["Date"])

# here save df to CSV

# this will create a dictionary, where the keys are crypto names and values
# are dicts with keys Date/Hash Rate:
out = {
    name: g[["Date", "Hash Rate"]].to_dict(orient="list")
    for name, g in df.groupby("Name")
}

print(out)
Prints:
{
"Bitcoin": {
"Date": [
Timestamp("2009-01-03 00:00:00"),
Timestamp("2009-01-04 00:00:00"),
Timestamp("2009-01-05 00:00:00"),
...
I have the following data:
# (employee, code) pairs, one per row of the frame.
records = [
    ("Emp1", "2018_1"),
    ("Emp2", "2018_3"),
    ("Emp3", "2019_1"),
    ("Emp4", "2019_2"),
    ("Emp5", "2017_1"),
]
employees, codes = zip(*records)
data = {"employee": list(employees), "code": list(codes)}

# Flat bonus, and the per-code bonuses keyed by the full code string.
old_salary_bonus = 3000
new_salary_bonus = {"2019_1": 1000, "2019_2": 980}

df = pd.DataFrame(data)
Task: Add df['salary_bonus'] column based on the following condition:
If employee's code contains '2019', use code value to retrieve salary bonus value from new_salary_bonus, else use old_salary_bonus value.
Expected Output:
employee code salary_bonus
0 Emp1 2018_1 3000
1 Emp2 2018_3 3000
2 Emp3 2019_1 1000
3 Emp4 2019_2 980
4 Emp5 2017_1 3000
Please help.
Use Series.map with Series.fillna to replace the non-matched values:
import pandas as pd
data = {
    "employee": ["Emp%d" % number for number in range(1, 6)],
    "code": ["2018_1", "2018_3", "2019_1", "2019_2", "2017_1"],
}

old_salary_bonus = 3000
new_salary_bonus = {"2019_1": 1000, "2019_2": 980}

df = pd.DataFrame(data)

# Look every code up in new_salary_bonus; codes without an entry come back
# as NaN, and those gaps are then filled with the old bonus. The result is a
# float column.
looked_up = df["code"].map(new_salary_bonus)
df["salary_bonus"] = looked_up.fillna(old_salary_bonus)
print(df)
employee code salary_bonus
0 Emp1 2018_1 3000.0
1 Emp2 2018_3 3000.0
2 Emp3 2019_1 1000.0
3 Emp4 2019_2 980.0
4 Emp5 2017_1 3000.0
Another solution uses dict.get with a default value for the codes that are not matched:
def _bonus_for(code):
    """Return the bonus stored for ``code``, or the old bonus if it has none."""
    return new_salary_bonus.get(code, old_salary_bonus)


df['salary_bonus'] = df['code'].map(_bonus_for)
You can use the code below:
# Start every row on the old bonus.
df['salary_bonus'] = old_salary_bonus
# Then overwrite the rows whose code has its own bonus, looking the amount up
# per row by its code. Assigning list(new_salary_bonus.values()) here would
# pair amounts with rows purely by position, which goes wrong as soon as the
# rows are in a different order than the dict, a code occurs more than once,
# or a code from the dict is missing in df.
has_new_bonus = df['code'].isin(list(new_salary_bonus))
df.loc[has_new_bonus, 'salary_bonus'] = df.loc[has_new_bonus, 'code'].map(new_salary_bonus)
I want to put Excel data into a dictionary.
Excel is
views.py is
#coding:utf-8
from django.shortcuts import render
import xlrd
# Open the workbook and take its first sheet.
# NOTE(review): this is the asker's snippet that shows the problem; the code is left unchanged.
book3 = xlrd.open_workbook('./data/excel.xlsx')
sheet3 = book3.sheet_by_index(0)
large_item = None
data_dict = {}
# Starts at 1, so row 0 of the sheet is not read by this loop.
for row_index in range(1,sheet3.nrows):
rows3 = sheet3.row_values(row_index)
# Keeps the previous large_item whenever column 1 of this row is empty/falsy.
large_item = rows3[1] or large_item
# NOTE(review): this rebinds data_dict to the current row's list instead of adding an entry,
# so after the loop data_dict is just the last row.
data_dict = rows3
Now when I print data_dict with print(data_dict), ['', '4', '10', 'Karen', ''] is shown. Before that, I wrote data_dict.extend(rows3) in place of data_dict = rows3, but then I got an error saying that a dict has no extend. My ideal output is
data_dict = {
1: {
user_id: 1,
name_id: 1,
name: Blear,
age: 40,
man: false,
employee: leader,
},
2: {
user_id: 2,
name_id: 5,
・
・
・
},
・
・
・
}
How should I write to achieve my goal?
Your problem is :
data_dict = rows3
This doesn't add rows3 to data_dict; it replaces the value of data_dict with rows3. So after the loop data_dict is equal to the last row.
To add element to a dict you need to do this:
data_dict[KEY] = VALUE
Your key will be the row index.
Now, as the VALUE, you want another dict like this:
{
user_id: 1,
name_id: 1,
name: Blear,
age: 40,
man: false,
employee: leader,
}
So for each row you need to construct this dict; use the headers and the cell values to do it.
I haven't tested this code; it's just to give you an idea of how to do it.
#coding:utf-8
from django.shortcuts import render
import xlrd
book3 = xlrd.open_workbook('./data/excel.xlsx')
sheet3 = book3.sheet_by_index(0)

# Row 0 holds the column headers; they become the keys of every row dict.
headers = sheet3.row_values(0)

large_item = None
data_dict = {}
for row_index in range(1, sheet3.nrows):
    rows3 = sheet3.row_values(row_index)
    # Carried over from the question: remember the last non-empty value
    # seen in column 1.
    large_item = rows3[1] or large_item

    # Create dict with headers and row values, skipping every column whose
    # header cell is empty.
    row_data = {
        header_value: value
        for header_value, value in zip(headers, rows3)
        if header_value
    }

    # Add row_data to data_dict under the sheet's row index.
    data_dict[row_index] = row_data
You can use python's library pandas for an easy solution:
import pandas as pd

# Read the first sheet of the workbook into a DataFrame.
df = pd.read_excel('your_excel_file.xls', sheet_name=0)
# orient="index" returns {row_label: {column_name: value}}, i.e. one inner
# dict per row. The default orient nests the other way round
# ({column_name: {row_label: value}}).
data_dict = df.to_dict(orient="index")
I want to build a dynamic dictionary in Python like this:
users = {log_id : { "message_id" : "1", "sent_to" : "taqi.official#outlook.com" , "unique_arguments" : "03455097679"},
log_id : { "message_id" : "1", "sent_to" : "taqi.hass#cogilent.com" , "unique_arguments" : "03455097679" },
log_id : { "message_id" : "2 Turab", "sent_to" : "taqi.official#gmailllllll.com" , "unique_arguments" : "4534534535" }}
I have written this code, but it does not build the dictionary as I desired:
# NOTE(review): this is the asker's snippet that shows the problem; the code is left unchanged.
cur = conn.cursor()
cur.execute("select log_id, message_id, sent_to, unique_arguments from sendmessage_log_messages where log_status = 'Queue'")
rows = cur.fetchall()
count = 0  # not used again in this snippet
# Column order per the select above: row[0]=log_id, row[1]=message_id, row[2]=sent_to, row[3]=unique_arguments.
for row in rows:
# NOTE(review): temp is not created inside this loop, so every pass writes into the same dict object.
temp['message_id'] = row[1]
temp['sent_to'] = str(row[2])
temp['unique_arguments'] = row[3]
# Stores a reference to that one shared dict, not a copy, so every key ends up showing the last row's values.
log_dictionary[row[0]] = temp
print log_dictionary
It produces this output:
{1: {'unique_arguments': 'log_8_taqi.official#gmailllllll.com', 'message_id': 8, 'sent_to': 'taqi.official#gmailllllll.com'},
2: {'unique_arguments': 'log_8_taqi.official#gmailllllll.com', 'message_id': 8, 'sent_to': 'taqi.official#gmailllllll.com'},
3: {'unique_arguments': 'log_8_taqi.official#gmailllllll.com', 'message_id': 8, 'sent_to': 'taqi.official#gmailllllll.com'},
4: {'unique_arguments': 'log_8_taqi.official#gmailllllll.com', 'message_id': 8, 'sent_to': 'taqi.official#gmailllllll.com'}}
Since no answer is given here, I am just giving the explanation here.
Here, you are reusing the same temp dictionary again and again. When you do log_dictionary[row[0]] = temp, the entry just points to the existing temp dictionary; no copy is made. Thus whenever you change any value in the temp dictionary and then read log_dictionary, every entry will show the updated temp dictionary.
Try this:
for row in rows:
    temp['message_id'] = row[1]
    temp['sent_to'] = str(row[2])
    temp['unique_arguments'] = row[3]
    # Stores a reference to the one shared temp dict, not a copy.
    log_dictionary[row[0]] = temp
# print(...) with a single argument prints the same on Python 2 and 3.
print(log_dictionary)

# Demonstration: change temp after the loop. The keys must be the same ones
# used above ('sent_to', not 'send_to'), otherwise a new key is added
# instead of an existing value being overwritten.
temp['message_id'] = 0
temp['sent_to'] = 'stackoverflow'
temp['unique_arguments'] = None
# Every entry of log_dictionary now shows the values assigned just above.
print(log_dictionary)
This will show the updated temp values in every entry, because log_dictionary only holds references to temp. As Ashwini Chaudhary mentioned in a comment on your question, if you create a new temp dictionary inside the for loop before storing it in log_dictionary, you will get the desired values, e.g.:
for row in rows:
    # A fresh dict on every pass, so each log_dictionary entry keeps its own
    # values instead of sharing one object with all the others.
    temp = {
        'message_id': row[1],
        'sent_to': str(row[2]),
        'unique_arguments': row[3],
    }
    log_dictionary[row[0]] = temp