spacy remove only org and person names - python

I have written the below function which removes all named entities from text. How could I modify it to remove only org and person names? I don't want to remove 6 from $6 from below. Thanks
import spacy
sp = spacy.load('en_core_web_sm')
def NER_removal(text):
document = sp(text)
text_no_namedentities = []
ents = [e.text for e in document.ents]
for item in document:
if item.text in ents:
pass
else:
text_no_namedentities.append(item.text)
return (" ".join(text_no_namedentities))
NER_removal("John loves to play at Sofi stadium at 6.00 PM and he earns $6")
'loves to play at stadium at 6.00 PM and he earns $'

I think item.ent_type_ will be useful here.
import spacy
sp = spacy.load('en_core_web_sm')
def NER_removal(text):
document = sp(text)
text_no_namedentities = []
# define ent types not to remove
ent_types_to_stay = ["MONEY"]
ents = [e.text for e in document.ents]
for item in document:
# add condition to leave defined ent types
if all((item.text in ents, item.ent_type_ not in ent_types_to_stay)):
pass
else:
text_no_namedentities.append(item.text)
return (" ".join(text_no_namedentities))
print(NER_removal("John loves to play at Sofi stadium at 6.00 PM and he earns $6"))
# loves to play at Sofi stadium at 6.00 PM and he earns $ 6

Related

Named Entity Extraction

I am trying to extract list of persons using Stanford Named Entity Recognizer (NER) in Python NLTK. Code and obtained output is like this
Code
from nltk.tag import StanfordNERTagger
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
sent = 'joel thompson tracy k smith new work world premierenew york philharmonic commission'
strin = sent.title()
value = st.tag(strin.split())
def get_continuous_chunks(tagged_sent):
continuous_chunk = []
current_chunk = []
for token, tag in tagged_sent:
if tag != "O":
current_chunk.append((token, tag))
else:
if current_chunk: # if the current chunk is not empty
continuous_chunk.append(current_chunk)
current_chunk = []
# Flush the final current_chunk into the continuous_chunk, if any.
if current_chunk:
continuous_chunk.append(current_chunk)
return continuous_chunk
named_entities = get_continuous_chunks(value)
named_entities_str = [" ".join([token for token, tag in ne]) for ne in named_entities]
print(named_entities_str)
Obtained Output
[('Joel Thompson Tracy K Smith New Work World Premierenew York Philharmonic Commission',
'PERSON')]
Desired Output
Person 1: Joel Thompson
Person 2: Tracy K Smith
Data : New Work World Premierenew York Philharmonic Commission

Convert in utf16

I am crawling several websites and extract the names of the products. In some names there are errors like this:
Malecon 12 Jahre 0,05 ltr.<br>Reserva Superior
Bols Watermelon Lik\u00f6r 0,7l
Hayman\u00b4s Sloe Gin
Ron Zacapa Edici\u00f3n Negra
Havana Club A\u00f1ejo Especial
Caol Ila 13 Jahre (G&M Discovery)
How can I fix that?
I am using xpath and re.search to get the names.
In every Python file, this is the first code: # -*- coding: utf-8 -*-
Edit:
This is the sourcecode, how I get the information.
if '"articleName":' in details:
closer_to_product = details.split('"articleName":', 1)[1]
closer_to_product_2 = closer_to_product.split('"imageTitle', 1)[0]
if debug_product == 1:
print('product before try:' + repr(closer_to_product_2))
try:
found_product = re.search(f'{'"'}(.*?)'f'{'",'}'closer_to_product_2).group(1)
except AttributeError:
found_product = ''
if debug_product == 1:
print('cleared product: ', '>>>' + repr(found_product) + '<<<')
if not found_product:
print(product_detail_page, found_product)
items['products'] = 'default'
else:
items['products'] = found_product
Details
product_details = information.xpath('/*').extract()
product_details = [details.strip() for details in product_details]
Where is a problem (Python 3.8.3)?
import html
strings = [
'Bols Watermelon Lik\u00f6r 0,7l',
'Hayman\u00b4s Sloe Gin',
'Ron Zacapa Edici\u00f3n Negra',
'Havana Club A\u00f1ejo Especial',
'Caol Ila 13 Jahre (G&M Discovery)',
'Old Pulteney \\u00b7 12 Years \\u00b7 40% vol',
'Killepitsch Kr\\u00e4uterlik\\u00f6r 42% 0,7 L']
for str in strings:
print( html.unescape(str).
encode('raw_unicode_escape').
decode('unicode_escape') )
Bols Watermelon Likör 0,7l
Hayman´s Sloe Gin
Ron Zacapa Edición Negra
Havana Club Añejo Especial
Caol Ila 13 Jahre (G&M Discovery)
Old Pulteney · 12 Years · 40% vol
Killepitsch Kräuterlikör 42% 0,7 L
Edit Use .encode('raw_unicode_escape').decode('unicode_escape') for doubled Reverse Solidi, see Python Specific Encodings

posting more than one line of text on fb status [Python] [Facebook API]

I started this project in the school holidays as a way to keep practising and enhancing my python knowledge. To put it shortly the code is a facebook bot that randomly generates NBA teams, Players and positions which should look like this when run.
Houston Rockets
[PG] Ryan Anderson
[PF] Michael Curry
[SF] Marcus Morris
[C] Bob Royer
[SF] Brian Heaney
I'm currently having trouble when it comes to posting my code to my facebook page where instead of posting 1 team and 5 players/positions the programme will only post 1 single player like this
Ryan Anderson
Here is my code
import os
import random
import facebook
token = "...."
fb = facebook.GraphAPI(access_token = token)
parent_dir = "../NBAbot"
os.chdir(parent_dir)
file_name = "nba_players.txt"
def random_position():
"""Random Team from list"""
position = ['[PG]','[SG]','[SF]','[PF]','[C]',]
random.shuffle(position)
position = position.pop()
return(position)
def random_team():
"""Random Team from list"""
Team = ['Los Angeles Lakers','Golden State Warriors','Toronto Raptors','Boston Celtics','Cleveland Cavaliers','Houston Rockets','San Antonio Spurs','New York Knicks','Chicago Bulls','Minnesota Timberwolves','Philadelphia 76ers','Miami Heat','Milwaukee','Portland Trail Blazers','Dallas Mavericks','Phoenix Suns','Denver Nuggets','Utah Jazz','Indiana Pacers','Los Angeles Clippers','Washington Wizards','Brooklyn Nets','New Orleans Pelicans','Sacramento Kings','Atlanta Hawks','Detroit Pistons','Memphis Grizzlies','Charlotte Hornets','Orlando Magic']
random.shuffle(Team)
Team = Team.pop()
return(Team)
def random_player(datafile):
read_mode = "r"
with open (datafile, read_mode) as read_file:
the_line = read_file.readlines()
return(random.choice(the_line))
def main():
return(
random_team(),
random_position(),
random_player(file_name),
random_position(),
random_player(file_name),
random_position(),
random_player(file_name),
random_position(),
random_player(file_name),
random_position(),
random_player(file_name))
fb.put_object(parent_object="me", connection_name='feed', message=main())
any help is appreciated.

Join whole word by its Tag Python

let say i have this sentences:
His/O name/O is/O Petter/Name Jack/Name and/O his/O brother/O name/O is/O
Jonas/Name Van/Name Dame/Name
How can i get result like this:
Petter Jack, Jonas Van Dame.
So far i've already tried this, but still its just join 2 word :
import re
pattern = re.compile(r"\w+\/Name)
sent = sentence.split()
for i , w in sent:
if pattern.match(sent[i]) != None:
if pattern.match(sent[i+1]) != None:
#....
#join sent[i] and sent[i+1] element
#....
Try something like this
pattern = re.compile(r"((\w+\/Name\s*)+)")
names = pattern.findall(your_string)
for name in names:
print(''.join(name[0].split('/Name')))
I'm thinking about a two-phase solution
r = re.compile(r'\w+\/Name(?:\ \w+\/Name)*')
result = r.findall(s)
# -> ['Petter/Name Jack/Name', 'Jonas/Name Van/Name Dame/Name']
for r in result:
print(r.replace('/Name', ''))
# -> Petter Jack
# -> Jonas Van Dame

html scraping using python topboxoffice list from imdb website

URL: http://www.imdb.com/chart/?ref_=nv_ch_cht_2
I want you to print top box office list from above site (all the movies' rank, title, weekend, gross and weeks movies in the order)
Example output:
Rank:1
title: godzilla
weekend:$93.2M
Gross:$93.2M
Weeks: 1
Rank: 2
title: Neighbours
This is just a simple way to extract those entities by BeautifulSoup
from bs4 import BeautifulSoup
import urllib2
url = "http://www.imdb.com/chart/?ref_=nv_ch_cht_2"
data = urllib2.urlopen(url).read()
page = BeautifulSoup(data, 'html.parser')
rows = page.findAll("tr", {'class': ['odd', 'even']})
for tr in rows:
for data in tr.findAll("td", {'class': ['titleColumn', 'weeksColumn','ratingColumn']}):
print data.get_text()
P.S.-Arrange according to your will.
There is no need to scrape anything. See the answer I gave here.
How to scrape data from imdb business page?
The below Python script will give you, 1) List of Top Box Office movies from IMDb 2) And also the List of Cast for each of them.
from lxml.html import parse
def imdb_bo(no_of_movies=5):
bo_url = 'http://www.imdb.com/chart/'
bo_page = parse(bo_url).getroot()
bo_table = bo_page.cssselect('table.chart')
bo_total = len(bo_table[0][2])
if no_of_movies <= bo_total:
count = no_of_movies
else:
count = bo_total
movies = {}
for i in range(0, count):
mo = {}
mo['url'] = 'http://www.imdb.com'+bo_page.cssselect('td.titleColumn')[i][0].get('href')
mo['title'] = bo_page.cssselect('td.titleColumn')[i][0].text_content().strip()
mo['year'] = bo_page.cssselect('td.titleColumn')[i][1].text_content().strip(" ()")
mo['weekend'] = bo_page.cssselect('td.ratingColumn')[i*2].text_content().strip()
mo['gross'] = bo_page.cssselect('td.ratingColumn')[(i*2)+1][0].text_content().strip()
mo['weeks'] = bo_page.cssselect('td.weeksColumn')[i].text_content().strip()
m_page = parse(mo['url']).getroot()
m_casttable = m_page.cssselect('table.cast_list')
flag = 0
mo['cast'] = []
for cast in m_casttable[0]:
if flag == 0:
flag = 1
else:
m_starname = cast[1][0][0].text_content().strip()
mo['cast'].append(m_starname)
movies[i] = mo
return movies
if __name__ == '__main__':
no_of_movies = raw_input("Enter no. of Box office movies to display:")
bo_movies = imdb_bo(int(no_of_movies))
for k,v in bo_movies.iteritems():
print '#'+str(k+1)+' '+v['title']+' ('+v['year']+')'
print 'URL: '+v['url']
print 'Weekend: '+v['weekend']
print 'Gross: '+v['gross']
print 'Weeks: '+v['weeks']
print 'Cast: '+', '.join(v['cast'])
print '\n'
Output (run in terminal):
parag#parag-innovate:~/python$ python imdb_bo_scraper.py
Enter no. of Box office movies to display:3
#1 Cinderella (2015)
URL: http://www.imdb.com/title/tt1661199?ref_=cht_bo_1
Weekend: $67.88M
Gross: $67.88M
Weeks: 1
Cast: Cate Blanchett, Lily James, Richard Madden, Helena Bonham Carter, Nonso Anozie, Stellan Skarsgård, Sophie McShera, Holliday Grainger, Derek Jacobi, Ben Chaplin, Hayley Atwell, Rob Brydon, Jana Perez, Alex Macqueen, Tom Edden
#2 Run All Night (2015)
URL: http://www.imdb.com/title/tt2199571?ref_=cht_bo_2
Weekend: $11.01M
Gross: $11.01M
Weeks: 1
Cast: Liam Neeson, Ed Harris, Joel Kinnaman, Boyd Holbrook, Bruce McGill, Genesis Rodriguez, Vincent D'Onofrio, Lois Smith, Common, Beau Knapp, Patricia Kalember, Daniel Stewart Sherman, James Martinez, Radivoje Bukvic, Tony Naumovski
#3 Kingsman: The Secret Service (2014)
URL: http://www.imdb.com/title/tt2802144?ref_=cht_bo_3
Weekend: $6.21M
Gross: $107.39M
Weeks: 5
Cast: Adrian Quinton, Colin Firth, Mark Strong, Jonno Davies, Jack Davenport, Alex Nikolov, Samantha Womack, Mark Hamill, Velibor Topic, Sofia Boutella, Samuel L. Jackson, Michael Caine, Taron Egerton, Geoff Bell, Jordan Long

Categories

Resources