Match two sets (/lists) in Python - python

I have two sets that are like below
Set A:
(['African American and Japanese', 'Indian', 'Chinese'])
Set B:
(['African', 'American', 'African American', 'Chinese', 'Russian'])
I want the output to be (['African American', 'Chinese']) but my script gives me either just Chinese or African, American, Chinese (splits African and American, I know that's how my script is, but am not sure how to edit).
I tried this so far.
import csv
alist, blist = [], []
with open("sample.csv", "rb") as fileA:
reader = csv.reader(fileA, delimiter=',')
for row in reader:
for row_str in row:
alist.append(row_str)
#alist = alist.strip().split() #If I use this, it also prints African, but doesn't print African American.
with open("ethnicity.csv", "rb") as fileB:
reader = csv.reader(fileB, delimiter='\n')
for row in reader:
blist += row
blist = [x.lower() for x in blist]
first_set = set(alist)
second_set = set(blist)
print [s for s in first_set if second_set in s]
EDIT:
Elements in SetA are not always separated by "and", it could be anything else or just a space.

You can first rearrange the list, i.e. split every list item that contains "and" into separate items.
Then use the intersection method of set to get the items common to both lists.
code:
def convert(input):
    """Split each item on the standalone word "and" and return the trimmed parts.

    The parameter keeps its original name ``input`` (it shadows the builtin
    only inside this function) so existing callers keep working.

    Args:
        input: Iterable of strings such as ``'African American and Japanese'``.

    Returns:
        A flat list with every part, whitespace-stripped, in input order.
        Items that do not contain the word "and" come back as a single part.
    """
    import re  # local import: the surrounding snippet has no import section

    # Match "and" only as a whole word. A plain ``str.split("and")`` also
    # cuts inside words, e.g. "Finland" -> "Finl" + "".
    whole_word_and = re.compile(r"\band\b")

    output = []
    for item in input:
        output.extend(part.strip() for part in whole_word_and.split(item))
    return output
# Demo: normalise both lists with convert() and intersect the results.
a = ['African American and Japanese', 'Indian', 'Chinese']
b = ['African American', 'Chinese']
a = convert(a)
# print() with a single argument behaves the same on Python 2 and 3,
# whereas the statement form ``print a`` is a SyntaxError on Python 3.
print(a)
b = convert(b)
print(set(a).intersection(set(b)))
Output:
set(['African American', 'Chinese'])
Is this helpful ?

If it could be any string (spaces included) separating the words, you can do something like this:
import re
# Marker used to glue the list items into one string and to split them apart
# again afterwards.
sep = ' ; '
# `a` and `b` are not defined in this snippet -- presumably the two lists from
# the question; verify before running.
# Join the items of `a` with the marker, split that string wherever a
# (possibly empty) run of lowercase letters sits between two single spaces,
# then re-join the pieces with the same marker.
_a = sep.join(re.split(' [a-z]* ', sep.join(a)))
# Same treatment for `b`.
_b = sep.join(re.split(' [a-z]* ', sep.join(b)))
# Split both strings on the marker and intersect the resulting pieces.
# The result is not assigned: this line is meant to be evaluated interactively.
set(_b.split(sep)).intersection(_a.split(sep))
It won't work when ; is separating two words in your lists... but I think it does handle all cases where the separator is a non-capitalized word.

Related

Removing words which are common to strings present in two lists

I have two list of strings
data_1 = ['The art is performed by james john.', 'art is quite silent']
data_2 = ['The art is performed by hans.', 'art is very quite silent']
I want to remove common words from strings present in list and return two separate lists
result_1 = ['james john','']
result_2 = ['hans', 'very']
I tried this way
print([' '.join(set(i.split()).difference(set(data_1))) for i in data_2])
How to obtain a result like result_1 and result_2
You could try using numpy's setdiff1d function. Like:
difference_1 = [" ".join(list(np.setdiff1d(np.array(x.split()), np.array(y.split())))) for x, y in zip(data_1, data_2)]
Using set.difference() should also work:
difference_1 = [" ".join(set(x.split()).difference(set(z.split()))) for x, z in zip(data_1, data_2)]
First tokenize the sentences using nltk
from nltk import word_tokenize
def list_tokenize(data):
    """Tokenize every sentence in ``data`` with ``word_tokenize``.

    Returns a list holding one ``word_tokenize`` result per sentence, in the
    order the sentences were given.
    """
    return list(map(word_tokenize, data))
then get the common words
def get_common_words(data_1_tokenized, data_2_tokenized):
    """Return, for each pair of tokenized sentences, the tokens they share.

    The two inputs are walked in lockstep (stopping at the shorter one). Each
    result entry is a list of the distinct tokens present in both sentences,
    in no particular order.
    """
    shared = []
    for tokens_1, tokens_2 in zip(data_1_tokenized, data_2_tokenized):
        shared.append(list(set(tokens_1) & set(tokens_2)))
    return shared
Then remove the common words
def remove_common_words(data, common_words):
    """Drop each sentence's common words and join what is left with spaces.

    ``data[i]`` is a list of tokens and ``common_words[i]`` holds the tokens
    to remove from it. The result has one space-joined string per sentence.
    """
    return [
        " ".join(token for token in tokens if token not in common_words[idx])
        for idx, tokens in enumerate(data)
    ]
Combined function to get unique words
def get_unique(data_1, data_2):
    """Return both sentence lists with the words common to each pair removed.

    Both inputs are tokenized, the shared tokens of each sentence pair are
    computed once, and those tokens are then removed from each side.

    Returns:
        A ``(result1, result2)`` tuple: the ``remove_common_words`` output for
        ``data_1`` and for ``data_2`` respectively.
    """
    tokens_1 = list_tokenize(data_1)
    tokens_2 = list_tokenize(data_2)
    shared = get_common_words(tokens_1, tokens_2)
    return (
        remove_common_words(tokens_1, shared),
        remove_common_words(tokens_2, shared),
    )
final usage
data_1 = ['The art is performed by james john.', 'art is quite silent']
data_2 = ['The art is performed by hans.', 'art is very quite silent']
result1,result2 = get_unique(data_1,data_2)
Results
result1=['james john', '']
result2=['hans', 'very']

I want to get rid of all the special characters like [(' and im really stuck in how to

def trackItems():
    """Write the per-item purchase counts to ``Stock.txt``.

    Runs a grouped COUNT query, prints the fetched rows, and writes them to
    the file as comma-joined ``str(row)`` values.

    ``cursor`` is not defined in this function; presumably it is a
    module-level database cursor -- confirm against the rest of the file.
    """
    cursor.execute("SELECT ItemsBought, COUNT(*) FROM Purchase GROUP BY ItemsBought")
    Graphs = cursor.fetchall()
    print(Graphs)
    values = ','.join([str(i) for i in Graphs])
    # The context manager closes (and flushes) the file even if write()
    # raises; the bare open() used before never closed the handle.
    with open("Stock.txt", "w") as f:
        f.write(values)
Output
('DONT', 1),('MY', 2),('PLEASE', 2)
How can I get rid of the opening and closing brackets and all the quotation marks? If anyone could help, it would be much appreciated.
You can replace substrings from a string with "" (so it will remove it) using str.replace(substring, "") e.g.
"(abcd(.ad".replace("(", "") #output: abcd.ad
Then you can just write this string to the file.
Code:
def trackItems():
    """Write the per-item purchase counts to ``Stock.txt`` without brackets or quotes.

    Runs a grouped COUNT query, prints the fetched rows, joins their
    ``str()`` forms with commas, and removes every ``(``, ``)`` and ``'``
    from that text before writing it.

    ``cursor`` is not defined in this function; presumably it is a
    module-level database cursor -- confirm against the rest of the file.
    """
    cursor.execute("SELECT ItemsBought, COUNT(*) FROM Purchase GROUP BY ItemsBought")
    Graphs = cursor.fetchall()
    print(Graphs)
    values = ','.join([str(i) for i in Graphs])
    # Plain text replacement: this also removes these characters when they
    # are part of the stored data, not only the tuple punctuation.
    for unwanted in ("(", ")", "'"):
        values = values.replace(unwanted, "")
    # The context manager closes (and flushes) the file even if write()
    # raises; the bare open() used before never closed the handle.
    with open("Stock.txt", "w") as f:
        f.write(values)
Rather than thinking about this as a problem about special characters, think about it as flattening a sequence (list) of subsequences (tuples, the rows) into a list of individual elements, which can then be joined.
You could do this with a for loop:
>>> flattened = []
>>> for row in Graphs:
... flattened.extend(row)
...
>>> flattened
['DONT', 1, 'MY', 2, 'PLEASE', 2]
but a list comprehension is more idiomatic
>>> Graphs = [('DONT', 1),('MY', 2),('PLEASE', 2)]
>>> values = ','.join([str(i) for j in Graphs for i in j])
>>> print(values)
DONT,1,MY,2,PLEASE,2

Creating lists from rows with different lengths in python

I am trying to create a list for each column in python of my data that looks like this:
399.75833 561.572000000 399.75833 561.572000000 a_Fe I 399.73920 nm
399.78316 523.227000000 399.78316 523.227000000
399.80799 455.923000000 399.80799 455.923000000 a_Fe I 401.45340 nm
399.83282 389.436000000 399.83282 389.436000000
399.85765 289.804000000 399.85765 289.804000000
The problem is that each row of my data has a different length. Is there any way to fill the missing fields of the shorter rows with a space so that all rows are the same length?
I would like my data to be in the form:
list one= [399.75833, 399.78316, 399.80799, 399.83282, 399.85765]
list two= [561.572000000, 523.227000000, 455.923000000, 389.436000000, 289.804000000]
list three= [a_Fe, " ", a_Fe, " ", " "]
This is the code I used to import the data into python:
fh = open('help.bsp').read()
the_list = []
for line in fh.split('\n'):
print line.strip()
splits = line.split()
if len(splits) ==1 and splits[0]== line.strip():
splits = line.strip().split(',')
if splits:the_list.append(splits)
You need to use izip_longest to make your column lists, since standard zip will only run till the shortest length in the given list of arrays.
try:
    from itertools import izip_longest  # Python 2 name
except ImportError:
    # Python 3 renamed the function to zip_longest.
    from itertools import zip_longest as izip_longest

with open('workfile', 'r') as f:
    fh = f.readlines()

# Process all the rows line by line. Blank lines are skipped: they would
# become empty rows, which puts None into the numeric columns and makes
# float() below raise TypeError.
rows = [line.split() for line in fh if line.strip()]

# Use izip_longest to get all columns, with None's filled in blank spots
cols = list(izip_longest(*rows))

# Then run your type conversions for your final data lists
list_one = [float(i) for i in cols[2]]
list_two = [float(i) for i in cols[3]]

# Since you want " " instead of None for blanks
list_three = [i if i else " " for i in cols[4]]
Output:
>>> print list_one
[399.75833, 399.78316, 399.80799, 399.83282, 399.85765]
>>> print list_two
[561.572, 523.227, 455.923, 389.436, 289.804]
>>> print list_three
['a_Fe', ' ', 'a_Fe', ' ', ' ']
So, your lines are either whitespace delimited or comma delimited, and if comma delimited, the line contains no whitespace? (note that if len(splits)==1 is true, then splits[0]==line.strip() is also true). That's not the data you're showing, and not what you're describing.
To get the lists you want from the data you show:
with open('help.bsp') as h:
    # Split every non-blank line on whitespace. A blank line would yield an
    # empty list and make d[0] below raise IndexError.
    the_list = [line.split() for line in h if line.strip()]

# First and second field of every row (kept as strings).
list_one = [d[0] for d in the_list]
list_two = [d[1] for d in the_list]
# Fifth field when the row has one, otherwise a single space as placeholder.
list_three = [d[4] if len(d) > 4 else ' ' for d in the_list]
If you're reading comma separated (or similarly delimited) files, I always recommend using the csv module - it handles a lot of edge cases that you may not have considered.

Finding the amount of lines and empty lines in a list of strings

I need to be able to find the number of lines and empty lines in a list of strings.
text = [
'Hi my name is bob',
'hi my name is jill',
'hi my name is john',
'hi my name jordan']
I have come up with
def stats(text: list):
for i in range(len(text)):
lines = (i + 1)
for i in text:
if i == '\n':
print(range(len(i)))
Finding the number of lines works, but finding the number of empty lines does not.
Do I need to use these methods?
result = []
.append()
also what methods could I use to be able to print out the avg characters per line and average character per non-empty line?
Maybe simply use list comprehension? Here is a demo:
>>> f = open('file')
>>> l = f.readlines()
>>> l
['my name is bob\n',
'\n',
'hi my name is jill\n',
'hi my name is john\n',
'\n',
'\n',
'hi my name jordan\n'] # there is 3 *empty lines* and 4 non-empty lines in this file
>>> len([i for i in l if i == '\n'])
3
>>> len([i for i in l if i != '\n'])
4
>>>
Simple version (that doesn't rely on the input even being a list; will work with any iterable):
def stats(lines):
    """Count all lines and the empty ones among them.

    Works with any iterable of strings (list, open file, generator). A line
    counts as empty when nothing is left after removing its trailing
    carriage-return / newline characters.

    Returns:
        A ``(total, empty)`` tuple of ints; ``(0, 0)`` for an empty iterable.
    """
    # Pre-set so that an empty iterable returns (0, 0): the loop below never
    # binds `total` in that case and the return would raise UnboundLocalError.
    total = 0
    empty = 0
    for total, line in enumerate(lines, start=1):
        if not line.rstrip('\r\n'):
            empty += 1
    return total, empty

Find matching value in nested list from list

I read two .csv files like this.
ori = "all.csv"
det = "find.csv"
names = []
namesa = []
with open(det, "r") as cursor:
for row in cursor:
cells = row.split(",")
if len(cells) > 2:
b = cells[1]
c = b.split("-")
names.append(c[0])
with open(ori, "r") as rcursor1: #read the document
for trow in rcursor1: #read each row
row1 = trow.split(",") #split it by your seperator
namesa.append(row1)
Works just fine.
namesa is a nested list where every row from my .csv is a list (see example), while names contains the values which I want to find in namesa.
If the value from names is in namesa, I want the whole "nested list part". So i.e.
#example
namesa = [[a,b,c,], [a1, b1, c1], [xy, cd, e2], [u1, i1, il], ...]
names = [a, u1,]
return = [[a1, b1, c1], [u1, i1, il], ...]
#or
namesa = [[john,bill,catherina,], [marti, alex, christoph], [ben, sherlock, london], [Bern, paris, Zürich], ...]
names = [sherlock, marti]
results = [[marti, alex, christoph], [ben, sherlock, london]]
Well, that does not work.
Thats what I tried so far:
#did not return any match
d = list([b for b in namesa if b in [a for a in names]])
print d
#did not return any match neither
for a in namesa:
for b in names:
if b in a:
print "match"
#well, that did not work neither
for a in namesa:
for b in names:
if a[5] == b:
print "match"
There are no matches coming back. I opened my two csv files in excel and searched "by hand" for matches which returned me results...
What am I doing wrong here? Working with python.
If you use .csv file I'd suggest you to use csv module.
I'd do it this way (I'm assuming that the things you're looking for are in the column 'surname'. If they are in different columns you can consider iterating over them, or testing name in row['surname'] or name in row['name'], depending on how complicated it gets):
import csv
result = []
names = ['alex', 'sherlock']
# The with-block closes the file even when reading raises, which the
# separate open()/close() calls did not guarantee.
with open('yourFile.csv') as csvFile:
    reader = csv.DictReader(csvFile)
    fieldnames = reader.fieldnames
    # Materialise all rows while the file is still open.
    listFromCSV = list(reader)
for name in names:
    wanted = name.strip()  # loop-invariant: strip once per name, not per row
    for row in listFromCSV:
        # Substring test against the 'surname' column of this row.
        if wanted in row['surname']:
            result.append(row)
And if you want to get rid of duplicates, add a break at the end of the last for loop.
namesa = [['john', 'bill', 'catherina'], ['cat', 'dog', 'foo'], ['noodle', 'bob']]
names = ['john','foo']
Try this
for n in names:
    wanted = n.strip()  # strip once per name instead of once per row
    for arr in namesa:
        # Compare against each cell on its own. Searching in ''.join(arr)
        # also matched text spanning two cells ('dogfoo') and fragments of
        # longer values ('cat' inside 'catherina').
        if any(wanted == cell.strip() for cell in arr):
            # Parenthesised so the line is valid on Python 3 as well as 2.
            print(arr)
.strip because the values in your names list seem to have trailing spaces.
namesa = [['john','bill','catherina',], ['marti', 'alex', 'christoph'], ['ben', 'sherlock', 'london']]
names = ['sherlock', 'marti']
# Print every row of namesa that contains one of the wanted names as a cell.
for i in namesa:
    for j in names:
        if j in i:
            # Parenthesised so the line is valid on Python 3 as well as 2;
            # the statement form ``print i`` is a SyntaxError on Python 3.
            print(i)
OUTPUT
['marti', 'alex', 'christoph']
['ben', 'sherlock', 'london']

Categories

Resources