Do you know how to solve this in python? I would like to have a dataframe with data arranged in the correct column.
Thanks in advance!
Here is an example of a string from a dataframe.
' Huidigefuncties Michael Jordan 2015 - present Director Marketing & Indirect Channels, Ricoh Nederland 2010 - present Basketball Center, Center for Business-Expertise Loopbaan Michael Jordan 2012 - 2015 Director Marketing & Business Development, Ricoh Opleiding Michael Jordan 1988 - 1992 Marketing , Harvard '
Preferred result
type from to function organization
current 2015 present Director Marketing & Indirect Channels Ricoh Nederland
current 2010 present Owner & Consultant Basketball Center
old 2012 2015 Director Marketing & Business Development Ricoh
school 1988 1992 Marketing Harvard
Current df
Name Data
Michael Jordan ' Huidigefuncties Michael Jordan 2015 - present Director Marketing & Indirect Channels, Ricoh Nederland 2010 - present Basketball Center, Center for Business-Expertise Loopbaan Michael Jordan 2012 - 2015 Director Marketing & Business Development, Ricoh Opleiding Michael Jordan 1988 - 1992 Marketing , Harvard '
Well, this is a solution that I did for this problem
import pandas as pd
beautiful_data = 'Huidigefuncties Michael Jordan 2015 - present Director Marketing & Indirect Channels, Ricoh Nederland 2010 - present Basketball Center, Center for Business-Expertise Loopbaan Michael Jordan 2012 - 2015 Director Marketing & Business Development, Ricoh Opleiding Michael Jordan 1988 - 1992 Marketing , Harvard'
main_dict = {'type':[], 'from':[], 'to':[], 'function':[], 'organization': []}
data = beautiful_data.split(' ')
i = 0
huidi_index = data.index('Huidigefuncties')
loopbaan_index = data.index('Loopbaan')
ople_index = data.index('Opleiding')
# print(data)
while i < len(data):
if data[i] == 'Huidigefuncties':
line = ' '.join(data[i + 1: loopbaan_index])
i = loopbaan_index
print(line)
type_data = 'current'
elif data[i] == 'Loopbaan':
line = ' '.join(data[i + 1: ople_index])
i = ople_index
print(line)
type_data = 'old'
elif data[i] == 'Opleiding':
line = ' '.join(data[i+1: ])
i = len(data)
print(line)
type_data = 'school'
else:
i += 1
data_line = line.split('-')
if len(data_line) == 2:
print(type_data)
main_dict['type'].append(type_data)
from_data = data_line[0].strip().split(' ')[-1]
print(from_data)
main_dict['from'].append(from_data)
to_data = data_line[1].strip().split(' ')[0]
print(to_data)
main_dict['to'].append(to_data)
function_data = ' '.join(data_line[1].strip().split(' ')[1:-1])[:-1]
print(function_data)
main_dict['function'].append(function_data)
organization_data = data_line[1].split(',')[-1].strip()
print(organization_data)
main_dict['organization'].append(organization_data)
elif len(data_line) > 2:
j = 0
while j < len(data_line):
register_data = data_line[j:j+2]
if len(register_data) > 1:
if len(register_data[0].split(' ')) > 1 and len(register_data[1].split(' ')) > 1:
if j == 0:
print(register_data)
print('----------')
print(type_data)
main_dict['type'].append(type_data)
from_data = register_data[0].strip().split(' ')[-1]
print(from_data)
main_dict['from'].append(from_data)
to_data = register_data[1].strip().split(' ')[0]
print(to_data)
main_dict['to'].append(to_data)
function_org = register_data[1].strip().split(',')
function_data = ' '.join(function_org[0].split(' ')[1:])
print(function_data)
main_dict['function'].append(function_data)
org_data = ' '.join(function_org[1].split(' ')[:-1]).strip()
print(org_data)
main_dict['organization'].append(org_data)
print('-----------')
else:
print('-----------')
print(register_data)
print(type_data)
main_dict['type'].append(type_data)
from_data = register_data[0].strip().split(' ')[-1]
print(from_data)
main_dict['from'].append(from_data)
to_data = register_data[1].strip().split(' ')[0]
print(to_data)
main_dict['to'].append(to_data)
function_org = register_data[1].strip().split(',')
function_data = ' '.join(function_org[0].split(' ')[1:])
print(function_data)
main_dict['function'].append(function_data)
org_data = ' '.join(function_org[1].split(' ')).strip()
print(org_data)
main_dict['organization'].append(org_data)
print('-----------')
j += 1
df = pd.DataFrame(main_dict)
Tested
Related
I am trying to create variables location; contract items; contract code; federal aid using regex on the following text:
PAGE 1
BID OPENING DATE 07/25/18 FROM 0.2 MILES WEST OF ICE HOUSE 07/26/18 CONTRACT NUMBER 03-2F1304 ROAD TO 0.015 MILES WEST OF CONTRACT CODE 'A '
LOCATION 03-ED-50-39.5/48.7 DIVISION HIGHWAY ROAD 44 CONTRACT ITEMS
INSTALL SANDTRAPS AND PULLOUTS FEDERAL AID ACNH-P050-(146)E
PAGE 1
BID OPENING DATE 07/25/18 IN EL DORADO COUNTY AT VARIOUS 07/26/18 CONTRACT NUMBER 03-2H6804 LOCATIONS ALONG ROUTES 49 AND 193 CONTRACT CODE 'C ' LOCATION 03-ED-0999-VAR 13 CONTRACT ITEMS
TREE REMOVAL FEDERAL AID NONE
PAGE 1
BID OPENING DATE 07/25/18 IN LOS ANGELES, INGLEWOOD AND 07/26/18 CONTRACT NUMBER 07-296304 CULVER CITY, FROM I-105 TO PORT CONTRACT CODE 'B '
LOCATION 07-LA-405-R21.5/26.3 ROAD UNDERCROSSING 55 CONTRACT ITEMS
ROADWAY SAFETY IMPROVEMENT FEDERAL AID ACIM-405-3(056)E
This text is from one word file; I'll be looping my code on multiple doc files. In the text above are three location; contract items; contract code; federal aid pairs. But when I use regex to create variables, only the first instance of each pair is included.
The code I have right now is:
# imports
import os
import pandas as pd
import re
import docx2txt
import textract
import antiword
all_bod = []
all_cn = []
all_location = []
all_fedaid = []
all_contractcode = []
all_contractitems = []
all_file = []
text = ' PAGE 1
BID OPENING DATE 07/25/18 FROM 0.2 MILES WEST OF ICE HOUSE 07/26/18 CONTRACT NUMBER 03-2F1304 ROAD TO 0.015 MILES WEST OF CONTRACT CODE 'A '
LOCATION 03-ED-50-39.5/48.7 DIVISION HIGHWAY ROAD 44 CONTRACT ITEMS
INSTALL SANDTRAPS AND PULLOUTS FEDERAL AID ACNH-P050-(146)E
PAGE 1
BID OPENING DATE 07/25/18 IN EL DORADO COUNTY AT VARIOUS 07/26/18 CONTRACT NUMBER 03-2H6804 LOCATIONS ALONG ROUTES 49 AND 193 CONTRACT CODE 'C ' LOCATION 03-ED-0999-VAR 13 CONTRACT ITEMS
TREE REMOVAL FEDERAL AID NONE
PAGE 1
BID OPENING DATE 07/25/18 IN LOS ANGELES, INGLEWOOD AND 07/26/18 CONTRACT NUMBER 07-296304 CULVER CITY, FROM I-105 TO PORT CONTRACT CODE 'B '
LOCATION 07-LA-405-R21.5/26.3 ROAD UNDERCROSSING 55 CONTRACT ITEMS
ROADWAY SAFETY IMPROVEMENT FEDERAL AID ACIM-405-3(056)E'
bod1 = re.search('BID OPENING DATE \s+ (\d+\/\d+\/\d+)', text)
bod2 = re.search('BID OPENING DATE\n\n(\d+\/\d+\/\d+)', text)
if not(bod1 is None):
bod = bod1.group(1)
elif not(bod2 is None):
bod = bod2.group(1)
else:
bod = 'NA'
all_bod.append(bod)
# creating contract number
cn1 = re.search('CONTRACT NUMBER\n+(.*)', text)
cn2 = re.search('CONTRACT NUMBER\s+(.........)', text)
if not(cn1 is None):
cn = cn1.group(1)
elif not(cn2 is None):
cn = cn2.group(1)
else:
cn = 'NA'
all_cn.append(cn)
# location
location1 = re.search('LOCATION \s+\S+', text)
location2 = re.search('LOCATION \n+\S+', text)
if not(location1 is None):
location = location1.group(0)
elif not(location2 is None):
location = location2.group(0)
else:
location = 'NA'
all_location.append(location)
# federal aid
fedaid = re.search('FEDERAL AID\s+\S+', text)
fedaid = fedaid.group(0)
all_fedaid.append(fedaid)
# contract code
contractcode = re.search('CONTRACT CODE\s+\S+', text)
contractcode = contractcode.group(0)
all_contractcode.append(contractcode)
# contract items
contractitems = re.search('\d+ CONTRACT ITEMS', text)
contractitems = contractitems.group(0)
all_contractitems.append(contractitems)
This code parses the only first instance of these variables in the text.
contract-number
location
contract-items
contract-code
federal-aid
03-2F1304
03-ED-50-39.5/48.7
44
A
ACNH-P050-(146)E
But, I am trying to figure out a way to get all possible instances in different observations.
contract-number
location
contract-items
contract-code
federal-aid
03-2F1304
03-ED-50-39.5/48.7
44
A
ACNH-P050-(146)E
03-2H6804
03-ED-0999-VAR
13
C
NONE
07-296304
07-LA-405-R21.5/26.3
55
B
ACIM-405-3(056)E
The all_variables in the code are for looping over multiple word files - we can ignore that if we want :).
Any leads would be super helpful. Thanks so much!
import re
data = []
df = pd.DataFrame()
regex_contract_number =r"(?:CONTRACT NUMBER\s+(?P<contract_number>\S+?)\s)"
regex_location = r"(?:LOCATION\s+(?P<location>\S+))"
regex_contract_items = r"(?:(?P<contract_items>\d+)\sCONTRACT ITEMS)"
regex_federal_aid =r"(?:FEDERAL AID\s+(?P<federal_aid>\S+?)\s)"
regex_contract_code =r"(?:CONTRACT CODE\s+\'(?P<contract_code>\S+?)\s)"
regexes = [regex_contract_number,regex_location,regex_contract_items,regex_federal_aid,regex_contract_code]
for regex in regexes:
for match in re.finditer(regex, text):
data.append(match.groupdict())
df = pd.concat([df, pd.DataFrame(data)], axis=1)
data = []
df
I am trying to extract list of persons using Stanford Named Entity Recognizer (NER) in Python NLTK. Code and obtained output is like this
Code
from nltk.tag import StanfordNERTagger
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
sent = 'joel thompson tracy k smith new work world premierenew york philharmonic commission'
strin = sent.title()
value = st.tag(strin.split())
def get_continuous_chunks(tagged_sent):
continuous_chunk = []
current_chunk = []
for token, tag in tagged_sent:
if tag != "O":
current_chunk.append((token, tag))
else:
if current_chunk: # if the current chunk is not empty
continuous_chunk.append(current_chunk)
current_chunk = []
# Flush the final current_chunk into the continuous_chunk, if any.
if current_chunk:
continuous_chunk.append(current_chunk)
return continuous_chunk
named_entities = get_continuous_chunks(value)
named_entities_str = [" ".join([token for token, tag in ne]) for ne in named_entities]
print(named_entities_str)
Obtained Output
[('Joel Thompson Tracy K Smith New Work World Premierenew York Philharmonic Commission',
'PERSON')]
Desired Output
Person 1: Joel Thompson
Person 2: Tracy K Smith
Data : New Work World Premierenew York Philharmonic Commission
I am trying to create a webscraper for a website. The problem is that after the collected data is stored in a list, I'm not able to write this to a csv file properly. I have been stuck for ages with this problem and hopefully someone has an idea about how to fix this one!
The loop to get the data from the web pages:
import csv
from htmlrequest import simple_get
from htmlrequest import BeautifulSoup
# Define variables
listData = ['Companies', 'Locations', 'Descriptions']
plus = 15
max = 30
count = 0
# while loop to repeat process till max is reached
while (count <= max):
start = 'https://www.companiesintheuk.co.uk/find?q=Activities+of+sport+clubs&start=' + str(count) + '&s=h&t=SicCodeSearch&location=&sicCode=93120'
raw_html = simple_get(start)
soup = BeautifulSoup(raw_html, 'html.parser')
for i, div in enumerate(soup.find_all('div', class_="search_result_title")):
listData[0] = listData[0].strip() + div.text
for i, div2 in enumerate(soup.find_all('div', class_="searchAddress")):
listData[1] = listData[1].strip() + div2.text
# This is extra information
# for i, div3 in enumerate(soup.find_all('div', class_="searchSicCode")):
# listData[2] = listData[2].strip() + div3.text
count = count + plus
output example if printed:
Companies
(AMG) AGILITY MANAGEMENT GROUP LTD
(KLA) LIONS/LIONESS FOOTBALL TEAMS WORLD CUP LTD
(Dissolved)
1 SPORT ORGANISATION LIMITED
100UK LTD
1066 GYMNASTICS
1066 SPECIALS
10COACHING LIMITED
147 LOUNGE LTD
147 SNOOKER AND POOL CLUB (LEICESTER) LIMITED
Locations
ENGLAND, BH8 9PS
LONDON, EC2M 2PL
ENGLAND, LS7 3JB
ENGLAND, LE2 8FN
UNITED KINGDOM, N18 2QX
AVON, BS5 0JH
UNITED KINGDOM, WC2H 9JQ
UNITED KINGDOM, SE18 5SZ
UNITED KINGDOM, EC1V 2NX
I've tried to get it into a CSV file by using this code but I can't figure out how to properly format my output! Any suggestions are welcome.
# writing to csv
with open('test.csv', 'w') as csvfile:
write = csv.writer(csvfile, delimiter=',')
write.writerow(['Name','Location'])
write.writerow([listData[0],listData[1]])
print("Writing has been done!")
I want the code to be able to format it properly in the csv file to be able to import the two rows in a database.
This is the output when I write the data on 'test.csv'
which will result into this when opened up
The expected outcome would be something like this!
I'm not sure how it is improperly formatted, but maybe you just need to replace with open('test.csv', 'w') with with open('test.csv', 'w+', newline='')
I've combined your code (taking out htmlrequests for requests and bs4 modules and also not using listData, but instead creating my own lists. I've left your lists but they do nothing):
import csv
import bs4
import requests
# Define variables
listData = ['Companies', 'Locations', 'Descriptions']
company_list = []
locations_list = []
plus = 15
max = 30
count = 0
# while loop to repeat process till max is reached
while count <= max:
start = 'https://www.companiesintheuk.co.uk/find?q=Activities+of+sport+clubs&start={}&s=h&t=SicCodeSearch&location=&sicCode=93120'.format(count)
res = requests.get(start)
soup = bs4.BeautifulSoup(res.text, 'html.parser')
for i, div in enumerate(soup.find_all('div', class_="search_result_title")):
listData[0] = listData[0].strip() + div.text
company_list.append(div.text.strip())
for i, div2 in enumerate(soup.find_all('div', class_="searchAddress")):
listData[1] = listData[1].strip() + div2.text
locations_list.append(div2.text.strip())
# This is extra information
# for i, div3 in enumerate(soup.find_all('div', class_="searchSicCode")):
# listData[2] = listData[2].strip() + div3.text
count = count + plus
if len(company_list) == len(locations_list):
with open('test.csv', 'w+', newline='') as csvfile:
writer = csv.writer(csvfile, delimiter=',')
writer.writerow(['Name', 'Location'])
for i in range(len(company_list)):
writer.writerow([company_list[i], locations_list[i]])
Which generates a csv file like:
Name,Location
(AMG) AGILITY MANAGEMENT GROUP LTD,"UNITED KINGDOM, M6 6DE"
"(KLA) LIONS/LIONESS FOOTBALL TEAMS WORLD CUP LTD
(Dissolved)","ENGLAND, BD1 2PX"
0161 STUDIOS LTD,"UNITED KINGDOM, HD6 3AX"
1 CLICK SPORTS MANAGEMENT LIMITED,"ENGLAND, E10 5PW"
1 SPORT ORGANISATION LIMITED,"UNITED KINGDOM, CR2 6NF"
100UK LTD,"UNITED KINGDOM, BN14 9EJ"
1066 GYMNASTICS,"EAST SUSSEX, BN21 4PT"
1066 SPECIALS,"EAST SUSSEX, TN40 1HE"
10COACHING LIMITED,"UNITED KINGDOM, SW6 6LR"
10IS ACADEMY LIMITED,"ENGLAND, PE15 9PS"
"10TH MAN LIMITED
(Dissolved)","GLASGOW, G3 6AN"
12 GAUGE EAST MANCHESTER COMMUNITY MMA LTD,"ENGLAND, OL9 8DQ"
121 MAKING WAVES LIMITED,"TYNE AND WEAR, NE30 1AR"
121 WAVES LTD,"TYNE AND WEAR, NE30 1AR"
1-2-KICK LTD,"ENGLAND, BH8 9PS"
"147 HAVANA LIMITED
(Liquidation)","LONDON, EC2M 2PL"
147 LOUNGE LTD,"ENGLAND, LS7 3JB"
147 SNOOKER AND POOL CLUB (LEICESTER) LIMITED,"ENGLAND, LE2 8FN"
1ACTIVE LTD,"UNITED KINGDOM, N18 2QX"
1ON1 KING LTD,"AVON, BS5 0JH"
1PUTT LTD,"UNITED KINGDOM, WC2H 9JQ"
1ST SPORTS LTD,"UNITED KINGDOM, SE18 5SZ"
2 BRO PRO EVENTS LTD,"UNITED KINGDOM, EC1V 2NX"
2 SPLASH SWIM SCHOOL LTD,"ENGLAND, B36 0EY"
2 STEPPERS C.I.C.,"SURREY, CR0 6BX"
2017 MOTO LIMITED,"UNITED KINGDOM, ME2 4NW"
2020 ARCHERY LTD,"LONDON, SE16 6SS"
21 LEISURE LIMITED,"LONDON, EC4M 7WS"
261 FEARLESS CLUB UNITED KINGDOM CIC,"LANCASHIRE, LA2 8RF"
2AIM4 LIMITED,"HERTFORDSHIRE, SG2 0JD"
2POINT4 FM LTD,"LONDON, NW10 8LW"
3 LIONS SCHOOL OF SPORT LTD,"BRISTOL, BS20 8BU"
3 PT LTD,"ANTRIM, BT40 2FB"
3 PUTT LIFE LTD,"UNITED KINGDOM, LU3 2DP"
3 THIRTY SEVEN LTD,"KENT, DA9 9RS"
3:30 SOCCER SCHOOL LTD,"UNITED KINGDOM, EH6 7JB"
30 MINUTE WORKOUT (LLANISHEN) LTD,"PONTYCLUN, CF72 9UA"
321 RELAX LTD,"MID GLAMORGAN, CF83 3HL"
360 MOTOR RACING CLUB LTD,"HALSTEAD, CO9 2ET"
3LIONSATHLETICS LIMITED,"ENGLAND, S3 8DB"
3S SWIM ROMFORD LTD,"UNITED KINGDOM, DA9 9DR"
3XL EVENT MANAGEMENT LIMITED,"KENT, BR3 4NW"
3XL MOTORSPORT MANAGEMENT LIMITED,"KENT, BR3 4NW"
4 CORNER FOOTBALL LTD,"BROMLEY, BR1 5DD"
4 PRO LTD,"UNITED KINGDOM, FY5 5HT"
Which seems fine to me, but your post was very unclear about how you expected it to be formatted so I really have no idea
I am supposed to get certain information from a .txt file and output it. This is the information I need:
State with the maximum population
State with the minimum population
Average state population
State of Texas population
The DATA looks like:
Alabama
AL
4802982
Alaska
AK
721523
Arizona
AZ
6412700
Arkansas
AR
2926229
California
CA
37341989
This is my code that does not really do anything I need it to do:
def main():
# Open the StateCensus2010.txt file.
census_file = open('StateCensus2010.txt', 'r')
# Read the state name
state_name = census_file.readline()
while state_name != '':
state_abv = census_file.readline()
population = int(census_file.readline())
state_name = state_name.rstrip('\n')
state_abv = state_abv.rstrip('\n')
print('State Name: ', state_name)
print('State Abv.: ', state_abv)
print('Population: ', population)
print()
state_name = census_file.readline()
census_file.close()
main()
All I have it doing is reading the state name, abv and converting the population into an int. I don't need it to do anything of that, however I'm unsure how to do what the assignment is asking. Any hints would definitely be appreciated! I've been trying some things for the past few hours to no avail.
Update:
This is my updated code however I'm receving the following error:
Traceback (most recent call last):
File "main.py", line 13, in <module>
if population > max_population:
TypeError: unorderable types: str() > int()
Code:
with open('StateCensus2010.txt', 'r') as census_file:
while True:
try:
state_name = census_file.readline()
state_abv = census_file.readline()
population = int(census_file.readline())
except IOError:
break
# data processing here
max_population = 0
for population in census_file:
if population > max_population:
max_population = population
print(max_population)
As the data is in consistent order; Statename, State Abv, Population. So you just need to read the lines one time, and display all three 3 information. Below is the sample code.
average = 0.0
total = 0.0
state_min = 999999999999
state_max = 0
statename_min = ''
statename_max = ''
texas_population = 0
with open('StateCensus2010.txt','r') as file:
# split new line, '\n' here means newline
data = file.read().split('\n')
# get the length of the data by using len() method
# there are 50 states in the text file
# each states have 3 information stored,
# state name, state abreviation, population
# that's why length of data which is 150/3 = 50 states
state_total = len(data)/3
# this count is used as an index for the list
count = 0
for i in range(int(state_total)):
statename = data[count]
state_abv = data[count+1]
population = int(data[count+2])
print('Statename : ',statename)
print('State Abv : ',state_abv)
print('Population: ',population)
print()
# sum all states population
total += population
if population > state_max:
state_max = population
statename_max = statename
if population < state_min:
state_min = population
statename_min = statename
if statename == 'Texas':
texas_population = population
# add 3 because we want to jump to next state
# for example the first three lines is Alabama info
# the next three lines is Alaska info and so on
count += 3
# divide the total population with number of states
average = total/state_total
print(str(average))
print('Lowest population state :', statename_min)
print('Highest population state :', statename_max)
print('Texas population :', texas_population)
This problem is pretty easy using pandas.
Code:
states = []
for line in data:
states.append(
dict(state=line.strip(),
abbrev=next(data).strip(),
pop=int(next(data)),
)
)
df = pd.DataFrame(states)
print(df)
print('\nmax population:\n', df.ix[df['pop'].idxmax()])
print('\nmin population:\n', df.ix[df['pop'].idxmin()])
print('\navg population:\n', df['pop'].mean())
print('\nAZ population:\n', df[df.abbrev == 'AZ'])
Test Data:
from io import StringIO
data = StringIO(u'\n'.join([x.strip() for x in """
Alabama
AL
4802982
Alaska
AK
721523
Arizona
AZ
6412700
Arkansas
AR
2926229
California
CA
37341989
""".split('\n')[1:-1]]))
Results:
abbrev pop state
0 AL 4802982 Alabama
1 AK 721523 Alaska
2 AZ 6412700 Arizona
3 AR 2926229 Arkansas
4 CA 37341989 California
max population:
abbrev CA
pop 37341989
state California
Name: 4, dtype: object
min population:
abbrev AK
pop 721523
state Alaska
Name: 1, dtype: object
avg population:
10441084.6
AZ population:
abbrev pop state
2 AZ 6412700 Arizona
Another pandas solution, from the interpreter:
>>> import pandas as pd
>>>
>>> records = [line.strip() for line in open('./your.txt', 'r')]
>>>
>>> df = pd.DataFrame([records[i:i+3] for i in range(0, len(records), 3)],
... columns=['State', 'Code', 'Pop']).dropna()
>>>
>>> df['Pop'] = df['Pop'].astype(int)
>>>
>>> df
State Code Pop
0 Alabama AL 4802982
1 Alaska AK 721523
2 Arizona AZ 6412700
3 Arkansas AR 2926229
4 California CA 37341989
>>>
>>> df.ix[df['Pop'].idxmax()]
State California
Code CA
Pop 37341989
Name: 4, dtype: object
>>>
>>> df.ix[df['Pop'].idxmin()]
State Alaska
Code AK
Pop 721523
Name: 1, dtype: object
>>>
>>> df['Pop'].mean()
10441084.6
>>>
>>> df.ix[df['Code'] == 'AZ' ]
State Code Pop
2 Arizona AZ 6412700
Please try this the earlier code was not python 3 compatible. It supported python 2.7
def extract_data(state):
total_population = 0
for states, stats in state.items():
population = stats.get('population')
state_name = stats.get('state_name')
states = states
total_population = population + total_population
if 'highest' not in vars():
highest = population
higherst_state_name = state_name
highest_state = states
if 'lowest' not in vars():
lowest = population
lowest_state_name = state_name
lowest_state = states
if highest < population:
highest = population
higherst_state_name = state_name
highest_state = states
if lowest > population:
lowest = population
lowest_state_name = state_name
lowest_state = states
print(highest_state, highest)
print(lowest_state, lowest)
print(len(state))
print(int(total_population/len(state)))
print(state.get('TX').get('population'))
def main():
# Open the StateCensus2010.txt file.
census_file = open('states.txt', 'r')
# Read the state name
state_name = census_file.readline()
state = {}
while state_name != '':
state_abv = census_file.readline()
population = int(census_file.readline())
state_name = state_name.rstrip('\n')
state_abv = state_abv.rstrip('\n')
if state_abv in state:
state[state_abv].update({'population': population, 'state_name': state_name})
else:
state.setdefault(state_abv,{'population': population, 'state_name': state_name})
state_name = census_file.readline()
census_file.close()
return state
state=main()
extract_data(state)
URL: http://www.imdb.com/chart/?ref_=nv_ch_cht_2
I want you to print top box office list from above site (all the movies' rank, title, weekend, gross and weeks movies in the order)
Example output:
Rank:1
title: godzilla
weekend:$93.2M
Gross:$93.2M
Weeks: 1
Rank: 2
title: Neighbours
This is just a simple way to extract those entities by BeautifulSoup
from bs4 import BeautifulSoup
import urllib2
url = "http://www.imdb.com/chart/?ref_=nv_ch_cht_2"
data = urllib2.urlopen(url).read()
page = BeautifulSoup(data, 'html.parser')
rows = page.findAll("tr", {'class': ['odd', 'even']})
for tr in rows:
for data in tr.findAll("td", {'class': ['titleColumn', 'weeksColumn','ratingColumn']}):
print data.get_text()
P.S.-Arrange according to your will.
There is no need to scrape anything. See the answer I gave here.
How to scrape data from imdb business page?
The below Python script will give you, 1) List of Top Box Office movies from IMDb 2) And also the List of Cast for each of them.
from lxml.html import parse
def imdb_bo(no_of_movies=5):
bo_url = 'http://www.imdb.com/chart/'
bo_page = parse(bo_url).getroot()
bo_table = bo_page.cssselect('table.chart')
bo_total = len(bo_table[0][2])
if no_of_movies <= bo_total:
count = no_of_movies
else:
count = bo_total
movies = {}
for i in range(0, count):
mo = {}
mo['url'] = 'http://www.imdb.com'+bo_page.cssselect('td.titleColumn')[i][0].get('href')
mo['title'] = bo_page.cssselect('td.titleColumn')[i][0].text_content().strip()
mo['year'] = bo_page.cssselect('td.titleColumn')[i][1].text_content().strip(" ()")
mo['weekend'] = bo_page.cssselect('td.ratingColumn')[i*2].text_content().strip()
mo['gross'] = bo_page.cssselect('td.ratingColumn')[(i*2)+1][0].text_content().strip()
mo['weeks'] = bo_page.cssselect('td.weeksColumn')[i].text_content().strip()
m_page = parse(mo['url']).getroot()
m_casttable = m_page.cssselect('table.cast_list')
flag = 0
mo['cast'] = []
for cast in m_casttable[0]:
if flag == 0:
flag = 1
else:
m_starname = cast[1][0][0].text_content().strip()
mo['cast'].append(m_starname)
movies[i] = mo
return movies
if __name__ == '__main__':
no_of_movies = raw_input("Enter no. of Box office movies to display:")
bo_movies = imdb_bo(int(no_of_movies))
for k,v in bo_movies.iteritems():
print '#'+str(k+1)+' '+v['title']+' ('+v['year']+')'
print 'URL: '+v['url']
print 'Weekend: '+v['weekend']
print 'Gross: '+v['gross']
print 'Weeks: '+v['weeks']
print 'Cast: '+', '.join(v['cast'])
print '\n'
Output (run in terminal):
parag#parag-innovate:~/python$ python imdb_bo_scraper.py
Enter no. of Box office movies to display:3
#1 Cinderella (2015)
URL: http://www.imdb.com/title/tt1661199?ref_=cht_bo_1
Weekend: $67.88M
Gross: $67.88M
Weeks: 1
Cast: Cate Blanchett, Lily James, Richard Madden, Helena Bonham Carter, Nonso Anozie, Stellan SkarsgÄrd, Sophie McShera, Holliday Grainger, Derek Jacobi, Ben Chaplin, Hayley Atwell, Rob Brydon, Jana Perez, Alex Macqueen, Tom Edden
#2 Run All Night (2015)
URL: http://www.imdb.com/title/tt2199571?ref_=cht_bo_2
Weekend: $11.01M
Gross: $11.01M
Weeks: 1
Cast: Liam Neeson, Ed Harris, Joel Kinnaman, Boyd Holbrook, Bruce McGill, Genesis Rodriguez, Vincent D'Onofrio, Lois Smith, Common, Beau Knapp, Patricia Kalember, Daniel Stewart Sherman, James Martinez, Radivoje Bukvic, Tony Naumovski
#3 Kingsman: The Secret Service (2014)
URL: http://www.imdb.com/title/tt2802144?ref_=cht_bo_3
Weekend: $6.21M
Gross: $107.39M
Weeks: 5
Cast: Adrian Quinton, Colin Firth, Mark Strong, Jonno Davies, Jack Davenport, Alex Nikolov, Samantha Womack, Mark Hamill, Velibor Topic, Sofia Boutella, Samuel L. Jackson, Michael Caine, Taron Egerton, Geoff Bell, Jordan Long