np.where() returns MemoryError - python

The number of np.where()'s I would assume is the issue since removing 1 will allow the function to work. I'm not aware of another way to edit a name other than an if else. I figured this would be faster. Mapping comes to mind as well, but I'm not sure how to return the names that are not changed. Any help understanding the best practice for this desired outcome would be very much appreciated!
# Takes names from various dataframes and websites and makes one name outcome
#
# Lookup table: cleaned name -> canonical name.
# Keys are compared against the cleaned name (title-cased, dots replaced by
# spaces, surrounding whitespace stripped). A key that is not already in that
# form -- e.g. one with interior capitals such as 'Jake McCabe', or with a
# trailing space -- can therefore never match; such entries are kept verbatim
# from the original table.
_PLAYER_NAME_FIXES = {
    'A J Greer': 'A.J. Greer',
    'Alexis Lafreni?Re': 'Alexis Lafreniere',
    'Alexis Lafrenière': 'Alexis Lafreniere',
    'Alexandre Carrier': 'Alex Carrier',
    'Alexander Burmistrov': 'Alex Burmistrov',
    'Alexander Petrovic': 'Alex Petrovic',
    'Alexander Edler': 'Alex Edler',
    'Alexander Kerfoot': 'Alex Kerfoot',
    'Alexander Nylander': 'Alex Nylander',
    'Alexander Radulov': 'Alex Radulov',
    'Alexander Steen': 'Alex Steen',
    'Alexandre Texier': 'Alex Texier',
    'Alexander Volkov': 'Alex Volkov',
    'Alexander Wennberg': 'Alex Wennberg',
    'Aaron Volpatt': 'Aaron Volpatti',
    'Adam Cracknel': 'Adam Cracknell',
    'Anze Kopitar': 'GOAT',
    'B J Crombeen': 'B.J. Crombeen',
    'C J Smith': 'C.J. Smith',
    'Christopher Tanev': 'Chris Tanev',
    'Colin White': 'Colin White2',
    'Charlie Mcavoy': 'Charlie McAvoy',
    'Casey Desmith': 'Casey DeSmith',
    'Cal Petersen': 'Calvin Petersen',
    'Calvin De Haan': 'Calvin de Haan',
    'Cj Suess': 'C.J. Suess',
    'Dj King': 'D.J. King',
    'Erik Gustafsson': 'Erik Gustafsson2',
    'Evgenii Dadonov': 'Evgeny Dadonov',
    'Jake McCabe': 'Jake McCabe',
    'Jacob Macdonald': 'Jacob MacDonald',
    'Jacob de la Rose': 'Jacob De La Rose',
    'Jean-Francois Berube': 'J-F Berube',
    'Joseph Labate': 'Joseph LaBate',
    'J T Brown': 'J.T. Brown',
    'J T Compher': 'J.T. Compher',
    'J C Beaudin': 'J.C. Beaudin',
    'J T Miller': 'J.T. Miller',
    'Jc Lipon': 'J.C. Lipon',
    'Jt Wyman': 'J.T. Wyman',
    'Martin St Louis': 'Martin St. Louis',
    'Matthew Benning': 'Matt Benning',
    'Maxime Comtois': 'Max Comtois',
    'Max VÃf©ronneau': 'Max Veronneau',
    'Max Lajoie': 'Maxime Lajoie',
    'Michael Matheson': 'Mike Matheson',
    'Mikhail Vorobyov': 'Mikhail Vorobyev',
    'Mitchell Marner': 'Mitch Marner',
    'Nicholas Caamano': 'Nick Caamano',
    'Nicholas Suzuki': 'Nick Suzuki',
    'P A Parenteau': 'P.A. Parenteau',
    'P J Axelsson': 'P.J. Axelsson',
    'P K Subban': 'P.K. Subban',
    'R J Umberger': 'R.J. Umberger',
    'Samuel Blais': 'Sammy Blais',
    'Steve Santini': 'Steven Santini',
    'Theodor Blueger': 'Teddy Blueger',
    'Tim Gettinger': 'Timothy Gettinger',
    'Tj Brodie': 'T.J. Brodie',
    'Tj Brennan': 'T.J. Brennan',
    'T J Brennan': 'T.J. Brennan',
    'Tj Tynan': 'T.J. Tynan',
    'T J Galiardi': 'T.J. Galiardi',
    'T J Hensick': 'T.J. Hensick',
    'T J Oshie': 'T.J. Oshie',
    'Tony Deangelo': 'Tony DeAngelo',
    'Anthony Deangelo': 'Tony DeAngelo',
    'Vincent Hinostroza': 'Vinnie Hinostroza',
    'Vitali Abramov': 'Vitaly Abramov',
    "Logan O'Connor": "Logan O'Connor",
    'Kurtis MacDermid': 'Kurtis MacDermid',
    'Zachary Senyshyn': 'Zach Senyshyn',
    'Christopher DiDomenico': 'Chris DiDomenico',
    'Michael Cammalleri': 'Mike Cammalleri',
    'Nicholas Shore': 'Nick Shore',
    'Pat Maroon': 'Patrick Maroon',
    'Ryan Macinnis': 'Ryan MacInnis',
    'Mackenzie Maceachern': 'Mackenzie MacEachern',
    'Alex Debrincat': 'Alex DeBrincat',
    'Samuel Montembeault': 'Sam Montembeault',
    'Danny Taylor': 'Daniel Taylor',
    'Pierre-Alexandr Parenteau': 'PA Parenteau',
    'Christian Wolanin': 'Christian Wolanin',
    "Dylan Sikura ": "Dylan Sikura",
    'Troy Terry ': 'Troy Terry',
    'Viktor Antipin': 'Victor Antipin',
    'Zach Aston-reese': 'Zach Aston-Reese',
    'Max Lagace': 'Maxime Lagace',
}


def player_name_filter(name):
    """Normalise a Series of player names to one canonical spelling.

    Each name is title-cased, its dots are replaced by spaces and surrounding
    whitespace is stripped. A cleaned name that is a key of
    ``_PLAYER_NAME_FIXES`` is replaced by the mapped value; every other name
    (including missing values) is returned as cleaned.

    Args:
        name: pandas Series of strings.

    Returns:
        A NumPy array with one entry per input row, in the same order.
    """
    # regex=False: the dot must be a literal dot. As a regular expression '.'
    # matches any character, which would blank out the whole name.
    cleaned = name.str.title().str.replace('.', ' ', regex=False).str.strip()
    # One dictionary lookup per row replaces the chain of nested np.where
    # calls, each of which built a full-length intermediate array.
    fixed = cleaned.map(lambda n: _PLAYER_NAME_FIXES.get(n, n))
    return fixed.to_numpy()

When you are dealing with more than two choices, use np.select.
# One boolean mask per name to rewrite, paired by position with `choices`.
# The literal `...` stands for the remaining entries.
conditions = [df['name'] == 'A J Greer', df['name'] == 'Alexis Lafrenière', ...]
choices = ['A.J. Greer', 'Alexis Lafreniere', ...]
# select is a NumPy function, not a DataFrame method: for every row it takes
# the choice of the first condition that is True and otherwise keeps the
# unchanged name passed as `default`.
np.select(conditions, choices, default=df['name'])
You can also use dictionary as a lookup table
name_table = {
    'A J Greer': 'A.J. Greer',
    'Alexis Lafrenière': 'Alexis Lafreniere',
    'B J Crombeen': 'B.J. Crombeen',
    # and so on
}
df['name'].str.title().apply(lambda x: name_table.get(x, x))


# or
def player_name_filter(dataf, column_name):
    """Title-case ``dataf[column_name]`` and apply ``name_table`` to it.

    The column is overwritten in place and the same DataFrame is returned so
    the function can be used with ``DataFrame.pipe``.
    """
    cleaned = dataf[column_name].str.title()
    # Fall back to the cleaned name itself: mapping with the bare dict would
    # turn every player that is not in the table into NaN.
    dataf[column_name] = cleaned.map(lambda n: name_table.get(n, n))
    return dataf


df = df.pipe(player_name_filter, column_name='name')

Related

How do I get a list from compounded names with strange characters in python?

How can I generate a list with regex in python for countries with compounded names?
names = ['Nizhniy Novgorod', 'Cần Thơ', 'Ba Beja', 'Bandar Bampung', 'Benin City', 'Ciudad Nezahualcóyotl', 'Biên Hòa', 'São Gonçalo', 'São Luís', 'New Orleans', 'Thủ Đức']
I was trying to do this but it returns all names:
import re
lst = []
for word in names:
    # word[0] and word[1] are the first two *characters* of the name, not its
    # two words. The pattern [A-Z]\w+ needs at least two characters, so both
    # findall calls return [] for a one-character string; the comparison is
    # therefore always True and every name is appended.
    if re.findall(r'[A-Z]\w+\b', word[0]) == re.findall(r'\b[A-Z]\w+', word[1]):
        lst.append(word)
print(lst)
Output:
['Nizhniy Novgorod', 'Cần Thơ', 'Ba Beja', 'Bandar Bampung', 'Benin City', 'Ciudad Nezahualcóyotl', 'Biên Hòa', 'São Gonçalo', 'São Luís', 'New Orleans', 'Thủ Đức']
The desired output would be [Ba Beja, Bandar Bampung].
It is an exercise, which is why I can only do it with the re module. Any help will be appreciated.
Ok - so I have two answers for you.
One that uses REGEX, and the other that doesn't.
Here is the REGEX version:
import re

names = ['Nizhniy Novgorod', 'Cần Thơ', 'Ba Beja', 'Bandar Bampung', 'Benin City', 'Ciudad Nezahualcóyotl', 'Biên Hòa', 'São Gonçalo', 'São Luís', 'New Orleans', 'Thủ Đức']
# Exactly two words that start with the same letter: group 1 captures the
# first letter and \1 requires it again right after the single whitespace
# character.
# [A-Za-z] is used instead of [A-z] because the range 'A'-'z' also contains
# the punctuation characters [ \ ] ^ _ and the backtick.
pattern = re.compile(r'^([A-Za-zÀ-ứ])[A-Za-zÀ-ứ]*\s\1[A-Za-zÀ-ứ]*$')
lst = [line for line in names if pattern.search(line)]
print(lst)
OUTPUT:
['Nizhniy Novgorod', 'Ba Beja', 'Bandar Bampung']
And here is the other answer that does not use Regex:
names = ['Nizhniy Novgorod', 'Cần Thơ', 'Ba Beja', 'Bandar Bampung', 'Benin City', 'Ciudad Nezahualcóyotl', 'Biên Hòa', 'São Gonçalo', 'São Luís', 'New Orleans', 'Thủ Đức']
space = ' '


def has_matching_initials(line):
    """Return True when *line* is two space-separated words with the same first character."""
    parts = line.split(space)
    # Anything other than two non-empty parts (one word, three or more words,
    # leading/trailing/double spaces) is rejected instead of raising on the
    # unpacking or on indexing an empty string.
    if len(parts) != 2 or not all(parts):
        return False
    return parts[0][0] == parts[1][0]


lst = [line for line in names if has_matching_initials(line)]
print(lst)
OUTPUT:
['Nizhniy Novgorod', 'Ba Beja', 'Bandar Bampung']

Find the anagram pairs from 2 lists and create a list of tuples of the anagrams

say I have two lists
# Named list_A (not list_1): that is the name the description and every
# snippet working on this data use.
list_A = [
    'Tar', 'Arc', 'Elbow', 'State', 'Cider', 'Dusty', 'Night', 'Inch', 'Brag',
    'Cat', 'Bored', 'Save', 'Angel', 'bla', 'Stressed', 'Dormitory',
    'School master', 'Awesoame', 'Conversation', 'Listen', 'Astronomer',
    'The eyes', 'A gentleman', 'Funeral', 'The Morse Code', 'Eleven plus two',
    'Slot machines', 'Fourth of July', 'Jim Morrison', 'Damon Albarn',
    'George Bush', 'Clint Eastwood', 'Ronald Reagan', 'Elvis',
    'Madonna Louise Ciccone', 'Bart', 'Paris', 'San Diego', 'Denver',
    'Las Vegas', 'Statue of Liberty',
]
and
list_B = ['Cried', 'He bugs Gore', 'They see', 'Lives', 'Joyful Fourth', 'The classroom', 'Diagnose', 'Silent', 'Taste', 'Car', 'Act', 'Nerved', 'Thing', 'A darn long era', 'Brat', 'Twelve plus one', 'Elegant man', 'Below', 'Robed', 'Study', 'Voices rant on', 'Chin', 'Here come dots', 'Real fun', 'Pairs', 'Desserts', 'Moon starer', 'Dan Abnormal', 'Old West action', 'Built to stay free', 'One cool dance musician', 'Dirty room', 'Grab', 'Salvages', 'Cash lost in me', "Mr. Mojo Risin'", 'Glean', 'Rat', 'Vase']
What I am looking for is to find the anagram pairs of list_A in list_B. Create a list of tuples of the anagrams.
For one list I can do the following and generate the list of tuples, however, for two lists I need some assistance. Thanks in advance for the help!
What I have tried for one list,
from collections import defaultdict

# Bucket the words of list_A by their sorted characters; words that land in
# the same bucket are anagrams of each other (case-sensitive).
anagrams = defaultdict(list)
for w in list_A:
    bucket = anagrams[tuple(sorted(w))]
    bucket.append(w)
You can use a nested for loop, outer for the first list, inner for the second (also, use str.lower to make it case-insensitive):
# Sort the letters of every list_B word once, up front, instead of once per
# (w_1, w_2) combination.
keys_B = [(sorted(w_2.lower()), w_2) for w_2 in list_B]

anagram_pairs = []  # (w_1 from list_A, w_2 from list_B)
for w_1 in list_A:
    # Invariant for the whole inner loop, so computed once per outer word.
    key_1 = sorted(w_1.lower())
    for key_2, w_2 in keys_B:
        if key_1 == key_2:
            anagram_pairs.append((w_1, w_2))
print(anagram_pairs)
Output:
[('Tar', 'Rat'), ('Arc', 'Car'), ('Elbow', 'Below'), ('State', 'Taste'), ('Cider', 'Cried'), ('Dusty', 'Study'), ('Night', 'Thing'), ('Inch', 'Chin'), ('Brag', 'Grab'), ('Cat', 'Act'), ('Bored', 'Robed'), ('Save', 'Vase'), ('Angel', 'Glean'), ('Stressed', 'Desserts'), ('School master', 'The classroom'), ('Listen', 'Silent'), ('The eyes', 'They see'), ('A gentleman', 'Elegant man'), ('The Morse Code', 'Here come dots'), ('Eleven plus two', 'Twelve plus one'), ('Damon Albarn', 'Dan Abnormal'), ('Elvis', 'Lives'), ('Bart', 'Brat'), ('Paris', 'Pairs'), ('Denver', 'Nerved')]
You are quite close with your current attempt. All you need to do is repeat the same process on list_B:
from collections import defaultdict

list_A = [
    'Tar', 'Arc', 'Elbow', 'State', 'Cider', 'Dusty', 'Night', 'Inch', 'Brag',
    'Cat', 'Bored', 'Save', 'Angel', 'bla', 'Stressed', 'Dormitory',
    'School master', 'Awesoame', 'Conversation', 'Listen', 'Astronomer',
    'The eyes', 'A gentleman', 'Funeral', 'The Morse Code', 'Eleven plus two',
    'Slot machines', 'Fourth of July', 'Jim Morrison', 'Damon Albarn',
    'George Bush', 'Clint Eastwood', 'Ronald Reagan', 'Elvis',
    'Madonna Louise Ciccone', 'Bart', 'Paris', 'San Diego', 'Denver',
    'Las Vegas', 'Statue of Liberty',
]
list_B = [
    'Cried', 'He bugs Gore', 'They see', 'Lives', 'Joyful Fourth',
    'The classroom', 'Diagnose', 'Silent', 'Taste', 'Car', 'Act', 'Nerved',
    'Thing', 'A darn long era', 'Brat', 'Twelve plus one', 'Elegant man',
    'Below', 'Robed', 'Study', 'Voices rant on', 'Chin', 'Here come dots',
    'Real fun', 'Pairs', 'Desserts', 'Moon starer', 'Dan Abnormal',
    'Old West action', 'Built to stay free', 'One cool dance musician',
    'Dirty room', 'Grab', 'Salvages', 'Cash lost in me', "Mr. Mojo Risin'",
    'Glean', 'Rat', 'Vase',
]

# Run the same bucketing over list_A and then list_B: words with identical
# sorted characters (compared case-sensitively) share a bucket.
anagrams = defaultdict(list)
for source in (list_A, list_B):
    for w in source:
        anagrams[tuple(sorted(w))].append(w)

# Only buckets holding more than one word are anagram groups.
result = [group for group in anagrams.values() if len(group) > 1]
Output:
[['Cider', 'Cried'], ['The eyes', 'They see'], ['Damon Albarn', 'Dan Abnormal'], ['Bart', 'Brat'], ['Paris', 'Pairs']]
Another solution using dictionary:
# Bucket the words of list_A by their lower-cased, sorted letters.
out = {}
for word in list_A:
    key = tuple(sorted(word.lower()))
    if key not in out:
        out[key] = []
    out[key].append(word)

# A list_B word is added only to a bucket that list_A already created.
for word in list_B:
    word_s = tuple(sorted(word.lower()))
    bucket = out.get(word_s)
    if bucket is not None:
        bucket.append(word)

# Buckets with more than one word are the anagram groups.
print([tuple(v) for v in out.values() if len(v) > 1])
Prints:
[
("Tar", "Rat"),
("Arc", "Car"),
("Elbow", "Below"),
("State", "Taste"),
("Cider", "Cried"),
("Dusty", "Study"),
("Night", "Thing"),
("Inch", "Chin"),
("Brag", "Grab"),
("Cat", "Act"),
("Bored", "Robed"),
("Save", "Vase"),
("Angel", "Glean"),
("Stressed", "Desserts"),
("School master", "The classroom"),
("Listen", "Silent"),
("The eyes", "They see"),
("A gentleman", "Elegant man"),
("The Morse Code", "Here come dots"),
("Eleven plus two", "Twelve plus one"),
("Damon Albarn", "Dan Abnormal"),
("Elvis", "Lives"),
("Bart", "Brat"),
("Paris", "Pairs"),
("Denver", "Nerved"),
]

How to check frequency of every unique value from pandas data-frame?

Say I have a data-frame of about 2000 rows in which the brand column has 142 unique values, and I want to count the frequency of every unique value, from the 1st to the 142nd. The values should change dynamically.
# Take the brand_name column of clothes_z (defined outside this snippet).
brand=clothes_z.brand_name
# Summary of the column; the result is not stored anywhere.
brand.describe(include="all")
# The distinct values of the column.
unique_brand=brand.unique()
# Final expression: a (summary, distinct values) tuple.
brand.describe(include="all"),unique_brand
Output:
(count 2613
unique 142
top Mango
freq 54
Name: brand_name, dtype: object,
array(['Jack & Jones', 'TOM TAILOR DENIM', 'YOURTURN', 'Tommy Jeans',
'Alessandro Zavetti', 'adidas Originals', 'Volcom', 'Pier One',
'Superdry', 'G-Star', 'SIKSILK', 'Tommy Hilfiger', 'Karl Kani',
'Alpha Industries', 'Farah', 'Nike Sportswear',
'Calvin Klein Jeans', 'Champion', 'Hollister Co.', 'PULL&BEAR',
'Nike Performance', 'Even&Odd', 'Stradivarius', 'Mango',
'Champion Reverse Weave', 'Massimo Dutti', 'Selected Femme Petite',
'NAF NAF', 'YAS', 'New Look', 'Missguided', 'Miss Selfridge',
'Topshop', 'Miss Selfridge Petite', 'Guess', 'Esprit Collection',
'Vero Moda', 'ONLY Petite', 'Selected Femme', 'ONLY', 'Dr.Denim',
'Bershka', 'Vero Moda Petite', 'PULL & BEAR', 'New Look Petite',
'JDY', 'Even & Odd', 'Vila', 'Lacoste', 'PS Paul Smith',
'Redefined Rebel', 'Selected Homme', 'BOSS', 'Brave Soul', 'Mind',
'Scotch & Soda', 'Only & Sons', 'The North Face',
'Polo Ralph Lauren', 'Gym King', 'Selected Woman', 'Rich & Royal',
'Rooms', 'Glamorous', 'Club L London', 'Zalando Essentials',
'edc by Esprit', 'OYSHO', 'Oasis', 'Gina Tricot',
'Glamorous Petite', 'Cortefiel', 'Missguided Petite',
'Missguided Tall', 'River Island', 'INDICODE JEANS',
'Kings Will Dream', 'Topman', 'Esprit', 'Diesel', 'Key Largo',
'Mennace', 'Lee', "Levi's®", 'adidas Performance', 'jordan',
'Jack & Jones PREMIUM', 'They', 'Springfield', 'Benetton', 'Fila',
'Replay', 'Original Penguin', 'Kronstadt', 'Vans', 'Jordan',
'Apart', 'New look', 'River island', 'Freequent', 'Mads Nørgaard',
'4th & Reckless', 'Morgan', 'Honey punch', 'Anna Field Petite',
'Noisy may', 'Pepe Jeans', 'Mavi', 'mint & berry', 'KIOMI', 'mbyM',
'Escada Sport', 'Lost Ink', 'More & More', 'Coffee', 'GANT',
'TWINTIP', 'MAMALICIOUS', 'Noisy May', 'Pieces', 'Rest',
'Anna Field', 'Pinko', 'Forever New', 'ICHI', 'Seafolly', 'Object',
'Freya', 'Wrangler', 'Cream', 'LTB', 'G-star', 'Dorothy Perkins',
'Carhartt WIP', 'Betty & Co', 'GAP', 'ONLY Tall', 'Next', 'HUGO',
'Violet by Mango', 'WEEKEND MaxMara', 'French Connection'],
dtype=object))
It shows only the frequency of Mango (54) because that is the top frequency, but I want the frequency of every value, e.g. the frequency of Jack & Jones, TOM TAILOR DENIM, YOURTURN and so on, and the values should change dynamically.
You could simply do,
clothes_z.brand_name.value_counts()
This would list down the unique values and would give you the frequency of every element in that Pandas Series.
from collections import Counter

import pandas as pd

# Your list of brands; here taken from the brand column used in the question.
ll = clothes_z.brand_name.tolist()
c = Counter(ll)
# you can do whatever you want with your counted values
df = pd.DataFrame.from_dict(c, orient='index', columns=['counted'])

Return a list of similar authors

I'm trying to write a function that will return a list of items from a key from a key (if that makes sense). For example, here's a dictionary of authors, and similar authors.
authors = {
'Ray Bradbury': ['Harlan Ellison', 'Robert Heinlein', 'Isaac Asimov', 'Arthur Clarke'],
'Harlan Ellison': ['Neil Stephenson', 'Kurt Vonnegut', 'Richard Morgan', 'Douglas Adams'],
'Kurt Vonnegut': ['Terry Pratchett', 'Tom Robbins', 'Douglas Adams', 'Neil Stephenson', 'Jeff Vandemeer'],
'Thomas Pynchon': ['Isaac Asimov', 'Jorges Borges', 'Robert Heinlein'],
'Isaac Asimov': ['Stephen Baxter', 'Ray Bradbury', 'Arthur Clarke', 'Kurt Vonnegut', 'Neil Stephenson'],
'Douglas Adams': ['Terry Pratchett', 'Chris Moore', 'Kurt Vonnegut']
}
And the function I came up with is this:
def get_similar(author_list, author):
    # The return statement runs on the first iteration, so the loop never
    # advances: the result is author_list[author] itself (or None when that
    # list is empty), not the similar authors of each item.
    for item in author_list[author]:
        return author_list[author]
Which only returns the items for the first key. I'd like it to return all of the similar authors, like this:
get_similar(authors, 'Harlan Ellison')
['Terry Pratchett', 'Tom Robbins', 'Douglas Adams', 'Neil Stephenson',
'Jeff Vandemeer','Terry Pratchett', 'Chris Moore', 'Kurt Vonnegut']
Where it finds the key given (author), looks at the items listed for that key, and then returns those key's items. In this case Harlan Ellison has four authors listed - Neil Stephenson, Kurt Vonnegut, Richard Morgan, and Douglas Adams. The function then looks up those authors, and returns the items listed for them - Kurt Vonnegut returns Terry Pratchett, Tom Robbins, Douglas Adams, Neil Stephenson, and Jeff Vandemeer, and Douglas Adams returns Terry Pratchett, Chris Moore, and Kurt Vonnegut,
Duplicates are fine, and I'd like it in alphabetical order (I assume you could just use a sort command at the end) Any help would be much appreciated, I'm stumped!
I think this is what you are looking for. Hopefully it gets you going.
authors = {'Ray Bradbury': ['Harlan Ellison', 'Robert Heinlein', 'Isaac Asimov', 'Arthur Clarke'], 'Harlan Ellison': ['Neil Stephenson', 'Kurt Vonnegut', 'Richard Morgan', 'Douglas Adams'], 'Kurt Vonnegut': ['Terry Pratchett', 'Tom Robbins', 'Douglas Adams', 'Neil Stephenson', 'Jeff Vandemeer'], 'Thomas Pynchon': ['Isaac Asimov', 'Jorges Borges', 'Robert Heinlein'], 'Isaac Asimov': ['Stephen Baxter', 'Ray Bradbury', 'Arthur Clarke', 'Kurt Vonnegut', 'Neil Stephenson'], 'Douglas Adams': ['Terry Pratchett', 'Chris Moore', 'Kurt Vonnegut']}
def get_similar(authors, author):
    """Return *author*'s similar authors and those authors' similar authors.

    Args:
        authors: dict mapping an author name to a list of similar names.
        author: name to look up.

    Returns:
        A sorted list with duplicates kept; empty when *author* is not a key.
    """
    retVal = []
    # Direct dictionary lookup instead of scanning every key for an equal
    # one; an unknown author still yields an empty result.
    for value in authors.get(author, []):
        retVal.append(value)
        # Names that only appear as values have no entry of their own.
        retVal.extend(authors.get(value, []))
    return sorted(retVal)
get_similar(authors, "Harlan Ellison") returns
['Chris Moore',
'Douglas Adams',
'Douglas Adams',
'Jeff Vandemeer',
'Kurt Vonnegut',
'Kurt Vonnegut',
'Neil Stephenson',
'Neil Stephenson',
'Richard Morgan',
'Terry Pratchett',
'Terry Pratchett',
'Tom Robbins']
I'll leave it to you to figure out how to remove the duplicates.
You are very close but instead of returning after finding the first list of similar authors, you should store all of the authors you find in a list and then return them all after your for loop has finished:
def get_similar(author_list, author):
    """Return the similar authors of each of *author*'s similar authors.

    Entries of ``author_list[author]`` that are not keys themselves are
    skipped. Order follows the dictionary's lists; duplicates are kept.
    """
    return [
        other
        for item in author_list[author]
        if item in author_list
        for other in author_list[item]
    ]
Notice that I also added an if statement to make sure that the item is in fact one of the keys in your dictionary so you don't get an error later on (for example: 'Neil Stephenson' is in the dictionary as a member of one of the values but is not a key).
EXTRA INFO:
(if you are interested)
Another option is to turn your function into a generator instead. This has the advantage of not having to store all the similar authors in a list and instead yields each author as it is found:
def get_similar2(author_list, author):
    """Yield, one at a time, the similar authors of *author*'s similar authors."""
    for item in author_list[author]:
        # Names without an entry of their own contribute nothing.
        if item not in author_list:
            continue
        for other_author in author_list[item]:
            yield other_author
Or if you are using python 3.3+ you can simplify this a bit by using the yield from expression to get functionally the same code as in get_similar2:
def get_similar3(author_list, author):
    """Generator yielding the similar authors of *author*'s similar authors."""
    for item in author_list[author]:
        # Skip names that are not keys of the dictionary.
        if item not in author_list:
            continue
        yield from author_list[item]
All three of the functions/generators above will give you the same results (just remember to get all the values yielded from the generators):
print(get_similar(authors, 'Harlan Ellison'))
['Terry Pratchett', 'Tom Robbins', 'Douglas Adams', 'Neil Stephenson', 'Jeff Vandemeer', 'Terry Pratchett', 'Chris Moore', 'Kurt Vonnegut']
print(list(get_similar2(authors, 'Harlan Ellison')))
['Terry Pratchett', 'Tom Robbins', 'Douglas Adams', 'Neil Stephenson', 'Jeff Vandemeer', 'Terry Pratchett', 'Chris Moore', 'Kurt Vonnegut']
print(list(get_similar3(authors, 'Harlan Ellison')))
['Terry Pratchett', 'Tom Robbins', 'Douglas Adams', 'Neil Stephenson', 'Jeff Vandemeer', 'Terry Pratchett', 'Chris Moore', 'Kurt Vonnegut']
Here's a simple solution using a set and list comprehension:
def get_similar(author_list, author):
    """Return the sorted, de-duplicated similar authors of *author*, two levels deep.

    An unknown *author* gives an empty list.
    """
    first_level = set(author_list.get(author, []))
    second_level = set()
    for item in first_level:
        second_level.update(author_list.get(item, []))
    return sorted(first_level | second_level)
get_similar(authors, 'Harlan Ellison')
Output:
['Chris Moore', 'Douglas Adams', 'Jeff Vandemeer', 'Kurt Vonnegut',
'Neil Stephenson', 'Richard Morgan', 'Terry Pratchett', 'Tom Robbins']
What you're doing now will work the same way without the for loop - you're essentially just doing a single lookup and return that, hence you get only one entry. What you need to do instead is to do your lookup, find the authors and then do a lookup for each of those authors, then rinse and repeat... The easiest way to do that is to use a bit of recursion:
def get_similar(authors, author):
    """Recursively collect every author reachable from *author*.

    Each visited entry is popped from *authors*, so the dictionary passed in
    is modified and an author is expanded at most once.
    """
    collected = []
    for x in authors.pop(author, []):
        collected.append(x)
        collected.extend(get_similar(authors, x))
    return collected
get_similar(authors, 'Harlan Ellison')
# ['Neil Stephenson', 'Kurt Vonnegut', 'Terry Pratchett', 'Tom Robbins', 'Douglas Adams',
# 'Terry Pratchett', 'Chris Moore', 'Kurt Vonnegut', 'Neil Stephenson', 'Jeff Vandemeer',
# 'Richard Morgan', 'Douglas Adams']
Then all you need to do is to turn it into a set to get rid of the duplicates and then sort it, or if you don't mind a slight performance hit (due to recursion) you can do it right inside your function:
def get_similar(authors, author):
    """Recursively collect every author reachable from *author*, sorted and unique.

    Each visited entry is popped from *authors*, so the dictionary passed in
    is modified.
    """
    found = set()
    for x in authors.pop(author, []):
        found.add(x)
        found.update(get_similar(authors, x))
    return sorted(found)
# ['Chris Moore', 'Douglas Adams', 'Jeff Vandemeer', 'Kurt Vonnegut', 'Neil Stephenson', 'Richard Morgan', 'Terry Pratchett', 'Tom Robbins']
Keep in mind that this modifies your input dictionary to avoid infinite recursion, so if you want to keep your authors dictionary intact call the function as get_similar(authors.copy(), author).
What is happening is that a function returns only once. To fix this, return the full row without iterating:
def get_similar(author_list, author):
    """Return a sorted copy of the list stored for *author*."""
    ordered = list(author_list[author])
    ordered.sort()
    return ordered
I'd use recursion to find similar authors in this fashion. Come to find out, it is even more inconvenient (and dangerous and slower) to want to return duplicates.
authors = {'Ray Bradbury': ['Harlan Ellison', 'Robert Heinlein', 'Isaac Asimov', 'Arthur Clarke'], 'Harlan Ellison': ['Neil Stephenson',
'Kurt Vonnegut', 'Richard Morgan', 'Douglas Adams'], 'Kurt Vonnegut': ['Terry Pratchett', 'Tom Robbins', 'Douglas Adams',
'Neil Stephenson', 'Jeff Vandemeer'], 'Thomas Pynchon': ['Isaac Asimov', 'Jorges Borges', 'Robert Heinlein'], 'Isaac Asimov':
['Stephen Baxter', 'Ray Bradbury', 'Arthur Clarke', 'Kurt Vonnegut', 'Neil Stephenson'], 'Douglas Adams': ['Terry Pratchett', 'Chris Moore', 'Kurt Vonnegut']}
def get_similar(author_list, author, currentList=None):
    """Collect the names listed for *author* without duplicates, sorted.

    Args:
        author_list: dict mapping an author name to a list of similar names.
        author: key to look up; a missing key raises KeyError.
        currentList: optional list to accumulate into (it is extended in
            place). A new list is used when omitted.

    Returns:
        A sorted copy of the accumulated names.
    """
    # A fresh list per top-level call: a mutable default would be shared and
    # keep accumulating names across unrelated calls.
    if currentList is None:
        currentList = []
    for similar in author_list[author]:
        if similar not in currentList:
            currentList.append(similar)
            # Check the mapping that was passed in, not a module-level
            # `authors` variable.
            if similar in author_list:
                # NOTE(review): this recurses on `author`, not `similar`, so
                # only the names listed directly under `author` are ever
                # collected. Left unchanged to keep the results as they are;
                # confirm whether a transitive search was intended.
                get_similar(author_list, author, currentList)
    return sorted(currentList)
print(get_similar(authors, "Harlan Ellison"))
Returns:
['Douglas Adams', 'Kurt Vonnegut', 'Neil Stephenson', 'Richard Morgan']
One way is using list comprehension + itertools.chain
from itertools import chain


def get_similar(author_list, author):
    """Return the sorted, unique similar authors of *author*'s similar authors.

    Only the dictionary passed as *author_list* is consulted.
    """
    # Every entry whose key is listed under `author` contributes its list;
    # chain.from_iterable flattens those lists without unpacking them into
    # call arguments.
    return sorted(set(chain.from_iterable(
        similar
        for name, similar in author_list.items()
        if name in author_list[author]
    )))
get_similar(authors, 'Harlan Ellison')
#['Chris Moore', 'Douglas Adams', 'Jeff Vandemeer', 'Kurt Vonnegut', 'Neil Stephenson', 'Terry Pratchett', 'Tom Robbins']
I would not include parameter author in the output if that's one of the elements in a list value. You could use list comprehension:
def get_similar(author_list, author):
# Lists of similar authors
similar = [author_list[auth] for auth in author_list[author] if auth in author_list]
# Merge the lists and sort the authors. Do not include parameter author
return sorted(auth for sub in similar for auth in sub if auth != author)
authors = {
'Ray Bradbury': ['Harlan Ellison', 'Robert Heinlein', 'Isaac Asimov', 'Arthur Clarke'],
'Harlan Ellison': ['Neil Stephenson', 'Kurt Vonnegut', 'Richard Morgan', 'Douglas Adams'],
'Kurt Vonnegut': ['Terry Pratchett', 'Tom Robbins', 'Douglas Adams', 'Neil Stephenson', 'Jeff Vandemeer'],
'Thomas Pynchon': ['Isaac Asimov', 'Jorges Borges', 'Robert Heinlein'],
'Isaac Asimov': ['Stephen Baxter', 'Ray Bradbury', 'Arthur Clarke', 'Kurt Vonnegut', 'Neil Stephenson'],
'Douglas Adams': ['Terry Pratchett', 'Chris Moore', 'Kurt Vonnegut']
}
>>> get_similar(authors, 'Harlan Ellison')
['Chris Moore', 'Douglas Adams', 'Jeff Vandemeer', 'Kurt Vonnegut', 'Neil Stephenson', 'Terry Pratchett', 'Terry Pratchett', 'Tom Robbins']
>>> get_similar(authors, 'Ray Bradbury') # There's 'Ray Bradbury' in the values of 'Isaac Asimov'
['Arthur Clarke', 'Douglas Adams', 'Kurt Vonnegut', 'Kurt Vonnegut', 'Neil Stephenson', 'Neil Stephenson', 'Richard Morgan', 'Stephen Baxter']

XPath - extracting table data with irregular pattern

Extending an existing question and answer here, I am trying to extract each player's name and position. The output would look like:
playername, position
EJ Manuel, Quarterbacks
Tyrod Taylor, Quarterbacks
Anthony Dixon, Running backs
...
This is what I have done so far:
tree = html.fromstring(requests.get("https://en.wikipedia.org/wiki/List_of_current_AFC_team_rosters").text)
# XPath tests an attribute with '@' (@class); '#class' is not valid XPath.
for h3 in tree.xpath("//table[@class='toccolours']//tr[2]"):
    # All bold texts and all player link texts found inside this row.
    position = h3.xpath(".//b/text()")
    players = h3.xpath(".//ul/li/a/text()")
    print(position, players)
The above codes can deliver the following, but not in the format I need.
(['Quarterbacks', 'Running backs', 'Wide receivers', 'Tight ends', 'Offensive linemen', 'Defensive linemen', 'Linebackers', 'Defensive backs', 'Special teams', 'Reserve lists', 'Unrestricted FAs', 'Restricted FAs', 'Exclusive-Rights FAs'], ['EJ Manuel', 'Tyrod Taylor', 'Anthony Dixon', 'Jerome Felton', 'Mike Gillislee', 'LeSean McCoy', 'Karlos Williams', 'Leonard Hankerson', 'Marcus Easley', 'Marquise Goodwin', 'Percy Harvin', 'Dez Lewis', 'Walt Powell', 'Greg Salas', 'Sammy Watkins', 'Robert Woods', 'Charles Clay', 'Chris Gragg', "Nick O'Leary", 'Tyson Chandler', 'Ryan Groy', 'Seantrel Henderson', 'Cyrus Kouandjio', 'John Miller', 'Kraig Urbik', 'Eric Wood', 'T. J. Barnes', 'Marcell Dareus', 'Lavar Edwards', 'IK Enemkpali', 'Jerry Hughes', 'Kyle Williams', 'Mario Williams', 'Jerel Worthy', 'Jarius Wynn', 'Preston Brown', 'Randell Johnson', 'Manny Lawson', 'Kevin Reddick', 'Tony Steward', 'A. J. Tarpley', 'Max Valles', 'Mario Butler', 'Ronald Darby', 'Stephon Gilmore', 'Corey Graham', 'Leodis McKelvin', 'Jonathan Meeks', 'Merrill Noel', 'Nickell Robey', 'Sammy Seamster', 'Cam Thomas', 'Aaron Williams', 'Duke Williams', 'Dan Carpenter', 'Jordan Gay', 'Garrison Sanborn', 'Colton Schmidt', 'Blake Annen', 'Jarrett Boykin', 'Jonathan Dowling', 'Greg Little', 'Jacob Maxwell', 'Ronald Patrick', 'Cedric Reed', 'Cyril Richardson', 'Phillip Thomas', 'James Wilder, Jr.', 'Nigel Bradham', 'Ron Brooks', 'Alex Carrington', 'Cordy Glenn', 'Leonard Hankerson', 'Richie Incognito', 'Josh Johnson', 'Corbin Bryant', 'Stefan Charles', 'MarQueis Gray', 'Chris Hogan', 'Jordan Mills', 'Ty Powell', 'Bacarri Rambo', 'Cierre Wood'])
(['Quarterbacks', 'Running backs', 'Wide receivers', 'Tight ends', 'Offensive linemen', 'Defensive linemen', 'Linebackers', 'Defensive backs', 'Special teams', 'Reserve lists', 'Unrestricted FAs', 'Restricted FAs', 'Exclusive-Rights FAs'], ['Zac Dysert', 'Ryan Tannehill', 'Logan Thomas', 'Jay Ajayi', 'Jahwan Edwards', 'Damien Williams', 'Tyler Davis', 'Robert Herron', 'Greg Jennings', 'Jarvis Landry', 'DeVante Parker', 'Kenny Stills', 'Jordan Cameron', 'Dominique Jones', 'Dion Sims', 'Branden Albert', 'Jamil Douglas', "Ja'Wuan James", 'Vinston Painter', 'Mike Pouncey', 'Anthony Steen', 'Dallas Thomas', 'Billy Turner', 'Deandre Coleman', 'Quinton Coples', 'Terrence Fede', 'Dion Jordan', 'Earl Mitchell', 'Damontre Moore', 'Jordan Phillips', 'Ndamukong Suh', 'Charles Tuaau', 'Robert Thomas', 'Cameron Wake', 'Julius Warmsley', 'Jordan Williams', 'Neville Hewitt', 'Mike Hull', 'Jelani Jenkins', 'Terrell Manning', 'Chris McCain', 'Koa Misi', 'Zach Vigil', 'Walt Aikens', 'Damarr Aultman', 'Brent Grimes', 'Reshad Jones', 'Tony Lippett', 'Bobby McCain', 'Brice McCain', 'Tyler Patmon', 'Dax Swanson', 'Jamar Taylor', 'Matt Darr', 'John Denney', 'Andrew Franks', 'Louis Delmas', 'James-Michael Johnson', 'Rishard Matthews', 'Jacques McClendon', 'Lamar Miller', 'Matt Moore', 'Spencer Paysinger', 'Derrick Shelby', 'Kelvin Sheppard', 'Shelley Smith', 'Olivier Vernon', 'Michael Thomas', 'Brandon Williams', 'Shamiel Gary', 'Matt Hazel', 'Ulrick John', 'Jake Stoneburner'])
...
Any suggestions?
You can use nested loop for this task. First loop through the positions and then, for each position, loop through the corresponding players :
# loop through positions
# (XPath tests an attribute with '@'; '#class' is not valid XPath)
for b in tree.xpath("//table[@class='toccolours']//tr[2]//b"):
    # get current position text
    position = b.xpath("text()")[0]
    # get players that correspond to the current position: the links, without
    # child elements, in the first <ul> that follows the position heading
    for a in b.xpath("following::ul[1]/li/a[not(*)]"):
        # get current player text
        player = a.xpath("text()")[0]
        # print current position and player together
        print(position, player)
Last part of the output :
.....
('Reserve lists', 'Chris Watt')
('Reserve lists', 'Eric Weddle')
('Reserve lists', 'Tourek Williams')
('Practice squad', 'Alex Bayer')
('Practice squad', 'Isaiah Burse')
('Practice squad', 'Richard Crawford')
('Practice squad', 'Ben Gardner')
('Practice squad', 'Michael Huey')
('Practice squad', 'Keith Lewis')
('Practice squad', 'Chuka Ndulue')
('Practice squad', 'Tim Semisch')
('Practice squad', 'Brad Sorensen')
('Practice squad', 'Craig Watts')

Categories

Resources