Trying To Get Similar Username From Six Different Files - python

I have scraped the followers lists of six different people from Instagram and I am trying to get the usernames that appear in all six accounts, but so far my results are not accurate, so any help would be appreciated.
Here is my code to open and read through the JSON files containing the followers lists, sort them into dictionaries keyed by the first two letters of each username, and compare them:
import json

with open('./JSONs Old/A.json', 'r', encoding='utf-8') as f:
    A = json.load(f)
with open('./JSONs Old/B.json', 'r', encoding='utf-8') as f:
    B = json.load(f)
with open('./JSONs Old/C.json', 'r', encoding='utf-8') as f:
    C = json.load(f)
with open('./JSONs Old/D.json', 'r', encoding='utf-8') as f:
    D = json.load(f)
with open('./JSONs Old/E.json', 'r', encoding='utf-8') as f:
    E = json.load(f)
with open('./JSONs Old/F.json', 'r', encoding='utf-8') as f:
    F = json.load(f)
with open('./JSONs Old/G.json', 'r', encoding='utf-8') as f:
    G = json.load(f)

Als = {}
Bls = {}
Cls = {}
Dls = {}
Els = {}
Fls = {}
Gls = {}

# Loop For A
for each in A:
    if each['id'][:2] in Als.keys():
        Als[each['id'][:2]].append(each)
    else:
        Als[each['id'][:2]] = [each]
# Loop For B
for each in B:
    if each['id'][:2] in Bls.keys():
        Bls[each['id'][:2]].append(each)
    else:
        Bls[each['id'][:2]] = [each]
# Loop For C
for each in C:
    if each['id'][:2] in Cls.keys():
        Cls[each['id'][:2]].append(each)
    else:
        Cls[each['id'][:2]] = [each]
# Loop For D
for each in D:
    if each['id'][:2] in Dls.keys():
        Dls[each['id'][:2]].append(each)
    else:
        Dls[each['id'][:2]] = [each]
# Loop For E
for each in E:
    if each['id'][:2] in Els.keys():
        Els[each['id'][:2]].append(each)
    else:
        Els[each['id'][:2]] = [each]
# Loop For F
for each in F:
    if each['id'][:2] in Fls.keys():
        Fls[each['id'][:2]].append(each)
    else:
        Fls[each['id'][:2]] = [each]
# Loop For G
for each in G:
    if each['id'][:2] in Gls.keys():
        Gls[each['id'][:2]].append(each)
    else:
        Gls[each['id'][:2]] = [each]

matchls = []
for i in B:
    if (i['id'][:2] in Als.keys()) and (i['id'][:2] in Cls.keys()) and (i['id'][:2] in Dls.keys()) and (i['id'][:2] in Els.keys()) and (i['id'][:2] in Fls.keys()) and (i['id'][:2] in Gls.keys()):
        matchls.append(i)
print(matchls)
The JSON files each contain a list of all the followers a person has on their Instagram page, and each list entry contains two key-value pairs, like the following:
[
    {
        "name": "Name1",
        "id": "username1"
    },
    {
        "name": "Name2",
        "id": "username2"
    }
]
I want to check whether an id from one file is in the other five files too.
Thanks in advance.

Here's one way to do it without sorting.
Using this function, you can check whether a follower's id appears in every one of the other JSON lists:
# (param 1) follower: a single dict that includes the key 'id'
# (param 2) follower_lists: a list of the other loaded JSON lists to check for an id match
def compareFollowers(follower, follower_lists):
    for followers in follower_lists:  # loop through each loaded JSON list
        if not any(follower['id'] == f['id'] for f in followers):  # look for a matching id in this list
            return False  # this list has no matching id
    return True  # every list contained a matching id
FYI, the any() function returns a boolean value:
True if at least one element of the iterable is truthy
False if all elements are falsy or the iterable is empty
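For example:
print(any(x > 2 for x in [1, 2, 3]))  # True: at least one element is > 2
print(any(x > 5 for x in [1, 2, 3]))  # False: no element is > 5
print(any([]))                        # False: empty iterable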
To print all followers common to every file, you can do this:
import json

with open('./JSONs Old/A.json', 'r', encoding='utf-8') as f:
    A = json.load(f)
with open('./JSONs Old/B.json', 'r', encoding='utf-8') as f:
    B = json.load(f)
with open('./JSONs Old/C.json', 'r', encoding='utf-8') as f:
    C = json.load(f)
with open('./JSONs Old/D.json', 'r', encoding='utf-8') as f:
    D = json.load(f)
with open('./JSONs Old/E.json', 'r', encoding='utf-8') as f:
    E = json.load(f)
with open('./JSONs Old/F.json', 'r', encoding='utf-8') as f:
    F = json.load(f)
with open('./JSONs Old/G.json', 'r', encoding='utf-8') as f:
    G = json.load(f)

follower_lists = [B, C, D, E, F, G]  # include every list but the first
for follower in A:  # loop through the first list and compare each follower's id
    if compareFollowers(follower, follower_lists):
        print(follower)
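As an aside (not part of the original answer), a set intersection over the ids can do the same comparison in one step. This is a minimal sketch assuming the same file layout as above; load_ids is a hypothetical helper:
import json

# hypothetical helper: load one followers file into a set of ids
def load_ids(path):
    with open(path, 'r', encoding='utf-8') as f:
        return {follower['id'] for follower in json.load(f)}

paths = ['./JSONs Old/{}.json'.format(name) for name in 'ABCDEFG']
id_sets = [load_ids(p) for p in paths]
common_ids = set.intersection(*id_sets)  # ids present in every file
print(common_ids)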

Related

Write values of a dictionary one in each line with DictWriter

So I have a Python dictionary with protein sequences and their ids. I wanted to convert that dictionary to a CSV file to upload it as a dataset to fine-tune a transformer. However, when I create the CSV it appears in the dictionary shape (key-value, key-value...).
What I want is the CSV to have one key and, on the next line, its value, and that shape consecutively. Is there a way to add a \n or something like that to have each key and value on its own line?
Shape of the dictionary:
{'NavAb:/1126': 'TNIVESSFFTKFIIYLIVLNGITMGLETSKTFMQSFGVYTTLFNQIVITIFTIEIILRIYVHRISFFKDPWSLFDFFVVAISLVPTSSGFEILRVLRVLRLFRLVTAVPQMRKI', 'Shaker:/1656': 'SSQAARVVAIISVFVILLSIVIFCLETLEDEVPDITDPFFLIETLCIIWFTFELTVRFLACPLNFCRDVMNVIDIIAIIPYFITTLNLLRVIRLVRVFRIFKLSRHSKGLQIL', .....
What I want in the CSV:
protein id
protein sequence
protein id
protein sequence
.....
The code I have for the moment:
import csv

def parse_file(input_file):
    parsed_seqs = {}
    curr_seq_id = None
    curr_seq = []
    for line in input_file:
        line = line.strip()
        line = line.replace('-', '')
        if line.startswith(">"):
            if curr_seq_id is not None:
                parsed_seqs[curr_seq_id] = ''.join(curr_seq)
            curr_seq_id = line[1:]
            curr_seq = []
            continue
        curr_seq.append(line)
    parsed_seqs[curr_seq_id] = ''.join(curr_seq)
    return parsed_seqs

newfile = open("/content/drive/MyDrive/Colab Notebooks/seqs.fasta")
parsed_seqs = parse_file(newfile)

with open('sequences.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, parsed_seqs.keys())
    w.writeheader()
    w.writerow(parsed_seqs)
To get CSV output with 2 columns, one for Protein ID and one for Protein Sequence, you can do this:
import csv

parsed_seqs = {
    'NavAb:/1126': 'TNIVESS',
    'Shaker:/1656': 'SSQAARVV'
}
column_names = ["Protein ID", "Protein Sequence"]

with open('sequences.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(column_names)          # header row
    w.writerows(parsed_seqs.items())  # each (key, value) pair becomes one row
Output:
Protein ID,Protein Sequence
NavAb:/1126,TNIVESS
Shaker:/1656,SSQAARVV
As an aside, the csv.DictWriter class works well when you have a list of dictionaries, where each dictionary is structured like {"column1": "value1", "column2": "value2"}. For example
parsed_seqs = [
    {"ID": "NavAb", "Seq": "TINVESS"},
    {"ID": "Shaker", "Seq": "SSQAARVV"}
]

with open("sequences.fa", "wt", newline="") as fd:
    wrtr = csv.DictWriter(fd, ["ID", "Seq"])
    wrtr.writeheader()
    wrtr.writerows(parsed_seqs)
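If you literally want the id on one line and the sequence on the next (the alternating shape described in the question) rather than two columns, a minimal sketch under that assumption:
import csv

parsed_seqs = {
    'NavAb:/1126': 'TNIVESS',
    'Shaker:/1656': 'SSQAARVV'
}

with open('sequences.csv', 'w', newline='') as f:
    w = csv.writer(f)
    for seq_id, seq in parsed_seqs.items():
        w.writerow([seq_id])  # id on its own line
        w.writerow([seq])     # sequence on the next line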

Python list: why append the same value

I created these for loops, and at the end I append the value of a dict to a list, but every time I append it adds the same value to my list, and I don't know why.
My code:
data_list = []
with open(arquivo, encoding="utf-8") as f:
    for jsonObj in f:
        data = json.loads(jsonObj)
        count = count + 1
        for item in data:
            if item in de:
                indice = de.index(item)
                data_new[para[indice]] = data[item]
            else:
                data_new[item] = data[item]
        data_list.append(data_new)
print(data_list)
Expected:
[{"a": 1}, {"b":2}]
Real result:
[{"a": 1}, {"a":1}]
My file is something like that:
{"Line1":"value","Key1":0}
{"Line2":"value","Key2":0}
{"Line3":"value","Key3":0}
And one thing I discovered: if I write to a file instead of calling data_list.append(), at the same indentation, it works. So I replaced data_list.append(data_new) with a call to write_file(data_new):
def write_file(data):
    with open('test.json', 'a') as f:
        json.dump(data, f)
        f.write('\n')
Obs.: printing and debugging, I can see the value of data_list, and on the first append the value is correct:
{"Line1":"value","Key1":0}
The second time through the loop, the value of data_new is "Line2" (correct), but when I do data_list.append(data_new) it changes my whole array to the last value read, and so on:
{"Line2":"value","Key2":0}
{"Line2":"value","Key2":0}
def write_file(data):
    with open('data-transfer-out-teste.json', 'a', encoding='utf-8') as f:
        json.dump(data, f)
        f.write('\n')

def main():
    config = ler_config()
    de = config["de"]
    para = config["para"]
    data_new = {}
    arquivo = "entrada.json"
    with open(arquivo, encoding="utf-8") as f:
        for jsonObj in f:
            data = json.loads(jsonObj)
            count = count + 1
            # item = keys in the file that was read
            # indice = position of the key in the "de" array
            for item in data:
                if item in de:
                    indice = de.index(item)
                    data_new[para[indice]] = data[item]
                else:
                    data_new[item] = data[item]
            write_file(data_new)
I may be wrong, but it seems that you need to add an indentation here:
data_list = []
with open(arquivo, encoding="utf-8") as f:
    for jsonObj in f:
        data = json.loads(jsonObj)
        count = count + 1
        for item in data:
            if item in de:
                indice = de.index(item)
                data_new[para[indice]] = data[item]
            else:
                data_new[item] = data[item]
        data_list.append(data_new)  ### here to add
print(data_list)
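For what it's worth, the symptom described (every element turning into the last value read) usually means the same dict object is appended repeatedly: data_new is created once and mutated in place, so the list holds several references to one object. A minimal sketch of that fix, assuming arquivo, de, and para are defined as in the question:
import json

data_list = []
with open(arquivo, encoding="utf-8") as f:
    for jsonObj in f:
        data = json.loads(jsonObj)
        data_new = {}  # a fresh dict per line, so each append stores a distinct object
        for item in data:
            if item in de:
                data_new[para[de.index(item)]] = data[item]
            else:
                data_new[item] = data[item]
        data_list.append(data_new)
print(data_list)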

Python json file modifying

I have a problem. Every time I do this, it just replaces the last user's wallet.
data = {}
usern = ctx.message.author
usern = str(usern)
data[usern] = []
data[usern].append({
    "money": 0
})
Got an answer for it:
from os.path import isfile
if isfile('config.json'):
    with open('config.json', 'r') as infile:
        data = json.load(infile)  # load from existing
    data1 = data
    usern = ctx.message.author
    usern = str(usern)
    data1[usern] = []
    data1[usern].append({
        "money": 0
    })
else:  # no file, start from scratch
    data = {}
    usern = ctx.message.author
    usern = str(usern)
    data[usern] = []
    data[usern].append({
        "money": 0
    })
with open('config.json', 'w') as outfile:
    json.dump(data, outfile)
Check whether the file exists & load the existing json structure from it. Change the beginning part of your program to this:
from os.path import isfile

if isfile('config.json'):  # check if the file exists
    with open('config.json', 'r') as infile:
        data = json.load(infile)  # load from existing
else:  # no file, start from scratch
    data = {}
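Putting the whole load-update-save cycle together might look like the sketch below; add_user is a hypothetical helper, and the usern argument stands in for str(ctx.message.author):
import json
from os.path import isfile

# hypothetical helper: load the wallets, add one user, write everything back
def add_user(usern, path='config.json'):
    if isfile(path):  # load the existing structure if the file exists
        with open(path, 'r') as infile:
            data = json.load(infile)
    else:  # no file yet, start from scratch
        data = {}
    data.setdefault(usern, [{"money": 0}])  # don't overwrite an existing wallet
    with open(path, 'w') as outfile:  # rewrite the file with every wallet intact
        json.dump(data, outfile)

add_user("example_user")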

Summing values from duplicate keys in a CSV file without panda

I have a large dataset that looks like the following
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
I would like to output:
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
Here is my python code so far:
headers = ["valuation_date","party_group_name","type","party_name","cp_group_name","cp_name","qualifier","amount"]
data = {}
with open(t1file,'rb') as f:
    reader = csv.reader(f)
    headers = reader.next()
    for row in reader:
        party = row[headers.index('party')]
        cp = row[headers.index('cp')]
        qualifier = row[headers.index('qualifier')]
        amount = row[headers.index('amount')]
        if row[headers.index('type')] == "Equity":
            new_qualifier = qualifier.split("_")[0]
            if party in data.keys():
                if cp in data.keys():
                    if new_qualifier in data.keys():
                        data[party][cp][new_qualifier] += float(amount)
                    else:
                        data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
                else:
                    data[cp] = cp
            else:
                data[party] = party
When I run the above code I get the following error:
data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
TypeError: string indices must be integers, not str
I'm very rusty with Python, apologies if it's glaringly obvious, but any insights as to what I'm doing wrong?
Thanks!
You can use pandas.drop_duplicates to drop duplicates across multiple columns and combine it with pandas.groupby() and sum to get the desired result:
>>> import pandas as pd
>>> # read file using pandas.read_csv()
>>> df
  party   cp   qualifier  amount
0   ABC  DEF    GOOGLE_2     100
1   ABC  DEF    GOOGLE_2     200
2   GHI  JKL  FACEBOOK_1     500
3   GHI  JKL  FACEBOOK_1    -600
>>> df['Total'] = df.groupby(['party','cp','qualifier'])['amount'].transform('sum')
>>> print(df.drop_duplicates(subset=['party','cp','qualifier'], keep='last'))
  party   cp   qualifier  amount  Total
1   ABC  DEF    GOOGLE_2     200    300
3   GHI  JKL  FACEBOOK_1    -600   -100
Below is a plain-Python approach using collections.defaultdict:
from collections import defaultdict

PARTY_IDX = 0
CP_IDX = 1
QUALIFIER_IDX = 2
AMOUNT_IDX = 3

data = defaultdict(int)
with open('del-me.csv') as f:
    lines = [l.strip() for l in f.readlines()]
    for idx, line in enumerate(lines):
        if idx > 0:
            fields = line.split(',')
            party = fields[PARTY_IDX]
            cp = fields[CP_IDX]
            qualifier = fields[QUALIFIER_IDX]
            qualifier = qualifier[:qualifier.find('_')]
            key = ','.join([party, cp, qualifier])
            amount = int(fields[AMOUNT_IDX])
            data[key] += amount

with open('out.csv', 'w') as f:
    for k, v in data.items():
        f.write('{},{}\n'.format(k, v))
del-me.csv
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
out.csv
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
You already have enough answers, but let me correct your own code to help you derive the answer and understand the original issue:
import csv

headers = ["valuation_date","party_group_name","party_name","cp_group_name","cp_name","qualifier","amount"]
data = {}
with open('test_data.csv','rt', encoding='utf-8') as f:
    reader = csv.reader(f)
    headers = next(reader)
    for row in reader:
        party = row[headers.index('party')]
        cp = row[headers.index('cp')]
        qualifier = row[headers.index('qualifier')]
        amount = row[headers.index('amount')]
        if row[headers.index('type')] == "Equity":
            new_qualifier = qualifier.split("_")[0]
            if party in data.keys():
                cp_ = data[party]
                if cp in cp_.keys():
                    qualifier_ = data[party][cp]
                    if new_qualifier in qualifier_.keys():
                        data[party][cp][new_qualifier] += float(amount)
                    else:
                        data[party][cp][qualifier][amount] = {}
                else:
                    data[cp] = {}
            else:
                data[party] = {}
                data[party][cp] = {}
                data[party][cp][qualifier.split("_")[0]] = float(amount)
print(data)
This gives you
{'ABC': {'DEF': {'GOOGLE': 300.0}}, 'GHI': {'JKL': {'FACEBOOK': -100.0}}}
The problem was how you were populating your dictionary and how you were accessing it.
In order to simplify things, you might use just one key for the dict, composed of the identifying parts of a given line.
You might have to extract values by the header names like you already did. The following is based on the specified input; rsplit is used to split the string once at the end, so the party,cp,qualifier combination can serve as the key and the amount can be extracted.
def sumUp():
    d = {}
    with open(t1file, 'r') as f:
        for line in f:
            if 'party' in line:
                continue  # skip header
            key, value = line.rsplit(',', 1)  # split once at the end
            d[key] = d[key] + int(value) if key in d else int(value)
    return d
You can do it like this:
from csv import DictReader, DictWriter

map_dic = dict()
with open('test1.csv', 'r') as fr:
    csv_reader = DictReader(fr, delimiter=',')
    for line in csv_reader:
        key = '{}_{}_{}'.format(line['party'], line['cp'], line['qualifier'])
        if key not in map_dic.keys():
            map_dic[key] = {'party': line['party'], 'cp': line['cp'], 'qualifier': line['qualifier'], 'amount': int(line['amount'])}
        else:
            map_dic[key]['amount'] = map_dic[key]['amount'] + int(line['amount'])

with open('test2.csv', 'w') as csvfile:
    writer = DictWriter(csvfile, fieldnames=['party', 'cp', 'qualifier', 'amount'])
    writer.writeheader()
    for key, data in map_dic.items():
        writer.writerow(data)
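Since the question title asks for a no-pandas solution, here is a minimal sketch combining the ideas above that also strips the _N suffix from the qualifier; the file names are placeholders, and integer amounts are assumed as in the sample:
import csv
from collections import defaultdict

totals = defaultdict(int)  # (party, cp, base qualifier) -> summed amount
with open('input.csv', newline='') as f:
    for row in csv.DictReader(f):
        base_qualifier = row['qualifier'].split('_')[0]  # GOOGLE_2 -> GOOGLE
        totals[(row['party'], row['cp'], base_qualifier)] += int(row['amount'])

with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for (party, cp, qualifier), amount in totals.items():
        writer.writerow([party, cp, qualifier, amount])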

Turning Text file to dictionary

I have a text file that has following structure:
mom:2
dad:3
mom:4
dad:2
me:4
And I need to make a dictionary that has each name only once, with the numeric values added together. In this case it would look like this:
{'dad': 5, 'me': 4, 'mom': 6}
How should I approach this problem?
I've tried:
d = {}
try:
    file = open("file.txt", "r")
    for line in file:
        (a, b) = line.split(":")
        d[a] = float(b)
except IOError:
    print()
but I haven't found a way to add up the values.
with open('file.txt', 'r') as f:
    fp = f.readlines()

t = [l.strip().split(':') for l in fp if l != '\n']
d = {}
for l in t:
    d[l[0]] = d.setdefault(l[0], 0) + int(l[1])
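A collections.Counter expresses the same accumulation a little more directly; a minimal sketch assuming the same file.txt layout:
from collections import Counter

d = Counter()
with open('file.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if line:  # skip blank lines
            name, value = line.split(':')
            d[name] += int(value)
print(dict(d))  # {'mom': 6, 'dad': 5, 'me': 4}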
