I am trying to extract audio features from Spotify using track URIs. I have a list of 500k URIs and would like to extract audio features for all of them. The working code below extracts features for 80 songs; I need help modifying it to process the URIs 80 at a time so I don't run afoul of Spotify's rate limit. An example of the list is below:
['spotify:track:2d7LPtieXdIYzf7yHPooWd',
'spotify:track:0y4TKcc7p2H6P0GJlt01EI',
'spotify:track:6q4c1vPRZREh7nw3wG7Ixz',
'spotify:track:54KFQB6N4pn926IUUYZGzK',
'spotify:track:0NeJjNlprGfZpeX2LQuN6c']
client_id = 'xxx'
client_secret = 'xxx'
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
def get_audio_features(saved_uris):
    artist = []
    track = []
    danceability = []
    energy = []
    key = []
    loudness = []
    mode = []
    speechiness = []
    acousticness = []
    instrumentalness = []
    liveness = []
    valence = []
    tempo = []
    duration_ms = []
    for uri in saved_uris:
        x = sp.audio_features(uri)
        y = sp.track(uri)
        for audio_features in x:
            danceability.append(audio_features['danceability'])
            energy.append(audio_features['energy'])
            key.append(audio_features['key'])
            loudness.append(audio_features['loudness'])
            mode.append(audio_features['mode'])
            speechiness.append(audio_features['speechiness'])
            acousticness.append(audio_features['acousticness'])
            instrumentalness.append(audio_features['instrumentalness'])
            liveness.append(audio_features['liveness'])
            valence.append(audio_features['valence'])
            tempo.append(audio_features['tempo'])
            duration_ms.append(audio_features['duration_ms'])
        artist.append(y['album']['artists'][0]['name'])
        track.append(y['name'])
    df = pd.DataFrame()
    df['artist'] = artist
    df['track'] = track
    df['danceability'] = danceability
    df['energy'] = energy
    df['key'] = key
    df['loudness'] = loudness
    df['mode'] = mode
    df['speechiness'] = speechiness
    df['acousticness'] = acousticness
    df['instrumentalness'] = instrumentalness
    df['liveness'] = liveness
    df['valence'] = valence
    df['tempo'] = tempo
    df['duration_ms'] = duration_ms
    df.to_csv('data/xxx.csv')
    return df
My output is a dataframe; it looks like this (I have cut some columns for readability):
artist            track         danceability  energy  key  loudness
Sleeping At Last  Chasing Cars  0.467         0.157   11
This code will return the dataframe that you require.
import spotipy
import time
import random  # needed for random.uniform below
from spotipy.oauth2 import SpotifyClientCredentials  # to access authorised Spotify data
import pandas as pd
client_id = 'paste client_id here'
client_secret = 'paste client_secret here'
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
sp.trace=False
#your uri list goes here
s_list = ['spotify:track:2d7LPtieXdIYzf7yHPooWd','spotify:track:0y4TKcc7p2H6P0GJlt01EI','spotify:track:6q4c1vPRZREh7nw3wG7Ixz','spotify:track:54KFQB6N4pn926IUUYZGzK','spotify:track:0NeJjNlprGfZpeX2LQuN6c']
#put uri to dataframe
df = pd.DataFrame(s_list)
df.columns = ['URI']
df['energy'] = ''*df.shape[0]
df['loudness'] = ''*df.shape[0]
df['speechiness'] = ''*df.shape[0]
df['valence'] = ''*df.shape[0]
df['liveness'] = ''*df.shape[0]
df['tempo'] = ''*df.shape[0]
df['danceability'] = ''*df.shape[0]
for i in range(0,df.shape[0]):
    time.sleep(random.uniform(3, 6))
    URI = df.URI[i]
    features = sp.audio_features(URI)
    df.loc[i,'energy'] = features[0]['energy']
    df.loc[i,'speechiness'] = features[0]['speechiness']
    df.loc[i,'liveness'] = features[0]['liveness']
    df.loc[i,'loudness'] = features[0]['loudness']
    df.loc[i,'danceability'] = features[0]['danceability']
    df.loc[i,'tempo'] = features[0]['tempo']
    df.loc[i,'valence'] = features[0]['valence']
uri=0
Output:
Hope this solves your problem.
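If you would rather hit the API in batches of 80 (as the question asks) instead of once per track, here is a minimal sketch along those lines, assuming the same sp client as above; spotipy's audio_features accepts a list of up to 100 IDs per call:

import pandas as pd

BATCH_SIZE = 80  # stays under Spotify's 100-ID-per-request limit

def get_features_in_batches(uris, batch_size=BATCH_SIZE):
    rows = []
    for start in range(0, len(uris), batch_size):
        batch = uris[start:start + batch_size]
        # one API call per batch instead of one call per track
        for features in sp.audio_features(batch):
            if features is not None:  # some tracks return no features
                rows.append(features)
    return pd.DataFrame(rows)

# usage: features_df = get_features_in_batches(s_list)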
Related
I am trying to split up a JSON file from Alpha Vantage's API into separate files depending on the date. I'm also trying to reformat the file to have blank values in the gaps where dates are missing. The following code is what I have come up with, but it gives me "TypeError: 'list' object is not callable". I'm fairly new to Python and pandas, so I'm sure there is a better way to go about this.
import requests
import pandas as pd
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
from pandas import DataFrame
import json
symbol = "MSFT"
symbol_list = symbol.split(",")
def num_el(list):
    count = 0
    for element in list:
        count += 1
    return count

def csv_make(sy, dar, dat):
    csv_file = open(f"{sy}_1min_{dar}.csv", "w", newline="")
    csv_file.write(dat)
    csv_file.close()
i = 0
x = -1
n = num_el(symbol_list)
while i < n:
    namesym = symbol_list[x]
    ticker = namesym
    api_key = 'APIKEYHERE'
    url = f'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={ticker}&outputsize=full&interval=1min&apikey={api_key}'
    data = requests.get(url)
    dsf = data.json()
    daf = pd.DataFrame(dsf['Time Series (1min)'])
    dxf: DataFrame = daf.T
    dxf.index.name = 'time'
    dxf.reset_index(inplace=True)
    dxf['time'] = pd.to_datetime(dxf['time'])
    dxf['minute'] = dxf['time'].dt.time
    dxf['day'] = dxf['time'].dt.day
    dxf['date'] = dxf['time'].dt.date
    agg = dxf.groupby([dxf['day']])
    length1 = dxf.groupby([dxf['day']]).size()
    length = pd.DataFrame(length1)
    length.index.name = 'day'
    length.reset_index(inplace=True)
    length_sum = length[0].sum()
    v = 0
    d = length_sum
    b = len(length)
    x2 = length_sum
    while v < b:
        a = length[0][v]
        x2 -= length[0][v]
        xd = agg.get_group(length['day'][v])
        date = xd['date'][x2]
        max_dt = parser.parse(str(max(xd['minute'])))
        min_dt = parser.parse(str(min(xd['minute'])))
        dt_range = []
        while min_dt <= max_dt:
            dt_range.append(min_dt.strftime("%H:%M:%S"))
            min_dt += timedelta(seconds=60)
        complete_df = pd.DataFrame({'minute': dt_range})
        xy = complete_df.astype('str')
        yx = xd.astype('str')
        dasf = xy.merge(yx, how='left', on='minute')
        dasf['ev'] = np.where(dasf['1. open'].notnull(), 'False', 'True')
        time = []
        open = []
        high = []
        low = []
        close = []
        volume = []
        empty_value = []
        for ib in range(len(dasf)):
            time.append(dasf['minute'][ib])
            open.append(dasf['1. open'][ib])
            high.append(dasf['2. high'][ib])
            low.append(dasf['3. low'][ib])
            close.append(dasf['4. close'][ib])
            volume.append(dasf['5. volume'][ib])
            empty_value.append(dasf['ev'][ib])
        time_df = pd.DataFrame(time).rename(columns={0: 'Time'})
        open_df = pd.DataFrame(open).rename(columns={0: 'Open'})
        high_df = pd.DataFrame(high).rename(columns={0: 'High'})
        low_df = pd.DataFrame(low).rename(columns={0: 'Low'})
        close_df = pd.DataFrame(close).rename(columns={0: 'Close'})
        volume_df = pd.DataFrame(volume).rename(columns={0: 'Volume'})
        empty_value_df = pd.DataFrame(empty_value).rename(columns={0: 'Empty Value'})
        frames = [time_df, open_df, high_df, low_df, close_df, volume_df, empty_value_df]
        df = pd.concat(frames, axis=1, join='inner')
        df = df.set_index('Time')
        ad = df.to_csv()
        csv_make(namesym, date, ad)
        v += 1
    i += 1
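A likely cause of the "TypeError: 'list' object is not callable" is that the line open = [] rebinds the built-in open, so when csv_make later calls open(...) it is actually calling that list. A minimal sketch of the fix is simply to rename the list; the names csv_make_demo and open_prices below are illustrative, not from the original code:

def csv_make_demo(sy, dar, dat):
    # still uses the built-in open, which works as long as nothing rebinds it
    csv_file = open(f"{sy}_1min_{dar}.csv", "w", newline="")
    csv_file.write(dat)
    csv_file.close()

open_prices = []              # instead of: open = []
open_prices.append("135.20")  # instead of: open.append(dasf['1. open'][ib])
csv_make_demo("MSFT", "demo-date", ",".join(open_prices))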
Working on a project to scrape billboard top 100 over multiple weeks, look up song audio features using Spotify's API, and save the info in a new pandas df.
I got this to work for up to 100 searches at a time (the Spotify API only allows 100 IDs per request), but I am having trouble writing code that iterates through the song IDs 100 at a time, calls the API, and saves the results into a new df.
Below is the working code for 100 id searches at a time:
df_import = pd.read_csv(r'xxx/Billboard_Top_100.csv')
track_id_list = []
artist_name_list = []
track_name_list = []
for item, row in df_import.head(100).iterrows():
    artist = row['Artist']
    track = row['Song']
    try:
        spotify_response = sp.search(q='artist:' + artist + ' track:' + track, type='track')
        #artist name
        artist_name = spotify_response['tracks']['items'][0]['artists'][0]['name']
        #song name
        track_name = spotify_response['tracks']['items'][0]['name']
        #unique Spotify track id used for audio feature search
        track_id = spotify_response['tracks']['items'][0]['uri']
        #splits string to search for features
        track_id_split = str.split(track_id, 'spotify:track:')
        track_id_list.append(track_id_split[1])
        artist_name_list.append(row['Artist'])
        track_name_list.append(row['Song'])
    except:
        DNF_song_search = sp.search(q=track)
        artist_name = DNF_song_search['tracks']['items'][0]['artists'][0]['name']
        if search(artist_name, artist):
            #song name
            track_name = DNF_song_search['tracks']['items'][0]['name']
            #unique Spotify track id used for audio feature search
            track_id = DNF_song_search['tracks']['items'][0]['uri']
            #splits string to search for features
            track_id_split = str.split(track_id, 'spotify:track:')
            track_id_list.append(track_id_split[1])
            artist_name_list.append(row['Artist'])
            track_name_list.append(row['Song'])
        else:
            print('Inconsistent artist match on: ' + artist + ' ' + artist_name + ' for song ' + track)
#spotify api to save song features based on track ids
features = sp.audio_features(track_id_list)
#save features list into pandas df
features_df = pd.DataFrame(data = features)
#add artist and song columns from imported billboard df
features_df['Artist'] = artist_name_list
features_df['Song'] = track_name_list
#combine the two dataframes
df_merged = pd.merge(df_import, features_df, on = 'Song', how = 'left')
df_merged.to_csv('merged.csv')
I have tried saving all of the song IDs into a list and then executing the API 100 IDs at a time, but I get various errors when I try to save the results into a new dataframe.
Solved it myself:
track_id_list = []
artist_name_list = []
track_name_list = []
for n in range(len(df_import) // 100):
    for r in range(99):
        artist = df_import.iloc[r+(n*100),3]
        track = df_import.iloc[r+(n*100),4]
        try:
            spotify_response = sp.search(q='artist:' + artist + ' track:' + track, type='track')
            artist_name = spotify_response['tracks']['items'][0]['artists'][0]['name']
            track_name = spotify_response['tracks']['items'][0]['name']
            #unique spotify track id used for audio feature search
            track_id = spotify_response['tracks']['items'][0]['uri']
            #splits string to search for features
            track_id_split = str.split(track_id, 'spotify:track:')
            track_id_list.append(track_id_split[1])
            artist_name_list.append(artist)
            track_name_list.append(track)
        except:
            DNF_song_search = sp.search(q=track)
            artist_name = DNF_song_search['tracks']['items'][0]['artists'][0]['name']
            if search(artist_name, artist):
                track_name = DNF_song_search['tracks']['items'][0]['name']
                track_id = DNF_song_search['tracks']['items'][0]['uri']
                track_id_split = str.split(track_id, 'spotify:track:')
                track_id_list.append(track_id_split[1])
                artist_name_list.append(artist)
                track_name_list.append(track)
            else:
                print('Inconsistent artist match on: ' + artist + ' ' + artist_name + ' for song ' + track)
features_df = pd.DataFrame()
for num in range(len(track_id_list) // 100 + 1):
    features = sp.audio_features(track_id_list[(num*100):(num+1)*100])
    features_df = features_df.append(pd.DataFrame(features))
#add artist and song columns from imported billboard df
features_df['Artist'] = artist_name_list
features_df['Song'] = track_name_list
#combine the two dataframes
df_merged = pd.merge(df_import, features_df.drop_duplicates(), on = 'Song', how = 'left')
df_merged.to_csv('mergedv2.csv')
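For what it's worth, a slightly tidier sketch of the chunking step, assuming track_id_list has already been filled as above; slicing handles the final partial chunk, and pd.concat avoids the deprecated DataFrame.append:

import pandas as pd

def features_for_ids(ids, chunk_size=100):
    frames = []
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start:start + chunk_size]  # the last chunk may be shorter
        frames.append(pd.DataFrame(sp.audio_features(chunk)))
    return pd.concat(frames, ignore_index=True)

# features_df = features_for_ids(track_id_list)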
When trying to scrape multiple pages of this website, I get no content in return. I usually check to make sure all the lists I'm creating are of equal length, but all are coming back as len = 0.
I've used similar code to scrape other websites, so why does this code not work correctly?
Some solutions I've tried that haven't worked for my purposes: requests.Session() as suggested in this answer, and .json as suggested here.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from googletrans import Translator
translator = Translator()
rg = []
ctr_n = []
ctr = []
yr = []
mn = []
sub = []
cst_n = []
cst = []
mag = []
pty_n = []
pty = []
can = []
pev1 = []
vot1 = []
vv1 = []
ivv1 = []
to1 = []
cv1 = []
cvs1 = []
pv1 = []
pvs1 = []
pev2 = []
vot2 = []
vv2 = []
ivv2 = []
to2 = []
cv2 = []
cvs2 =[]
pv2 = []
pvs2 = []
seat = []
no_info = []
manual = []
START_PAGE = 1
END_PAGE = 42
for page in range(START_PAGE, END_PAGE + 1):
page = requests.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/" + str(page))
page.encoding = page.apparent_encoding
if not page:
pass
else:
soup = BeautifulSoup(page.text, 'html.parser')
tbody = soup.find_all('table', class_='table table-borderd table-striped table-hover dataTable no-footer clickable right2 right4')
sleep(randint(2,10))
for container in tbody:
col1 = container.find_all('tr', {'data-id':'26079'})
for info in col1:
col_1 = info.find_all('td')
for data in col_1:
party = data[0]
party_trans = translator.translate(party)
pty_n.append(party_trans)
pvotes = data[1]
pv1.append(pvotes)
pshare = data[2]
pvs1.append(pshare)
mandates = data[3]
seat.append(mandates)
col2 = container.find_all('tr', {'data-id':'26075'})
for info in col2:
col_2 = info.find_all('td')
for data in col_2:
party2 = data[0]
party_trans2 = translator.translate(party2)
pty_n.append(party_trans2)
pvotes2 = data[1]
pv1.append(pvotes2)
pshare2 = data[2]
pvs1.append(pshare2)
mandates2 = data[3]
seat.append(mandates2)
col3 = container.find_all('tr', {'data-id':'26063'})
for info in col3:
col_3 = info.find_all('td')
for data in col_3:
party3 = data[0].text
party_trans3 = translator.translate(party3)
pty_n.extend(party_trans3)
pvotes3 = data[1].text
pv1.extend(pvotes3)
pshare3 = data[2].text
pvs1.extend(pshare3)
mandates3 = data[3].text
seat.extend(mandates3)
col4 = container.find_all('tr', {'data-id':'26091'})
for info in col4:
col_4 = info.find_all('td',recursive=True)
for data in col_4:
party4 = data[0]
party_trans4 = translator.translate(party4)
pty_n.extend(party_trans4)
pvotes4 = data[1]
pv1.extend(pvotes4)
pshare4 = data[2]
pvs1.extend(pshare4)
mandates4 = data[3]
seat.extend(mandates4)
col5 = container.find_all('tr', {'data-id':'26073'})
for info in col5:
col_5 = info.find_all('td')
for data in col_5:
party5 = data[0]
party_trans5 = translator.translate(party5)
pty_n.extend(party_trans5)
pvotes5 = data[1]
pv1.extend(pvotes5)
pshare5 = data[2]
pvs1.extend(pshare5)
mandates5 = data[3]
seat.extend(mandates5)
col6 = container.find_all('tr', {'data-id':'26080'})
for info in col6:
col_6 = info.find_all('td')
for data in col_6:
party6 = data[0]
party_trans6 = translator.translate(party6)
pty_n.extend(party_trans6)
pvotes6 = data[1]
pv1.extend(pvotes6)
pshare6 = data[2]
pvs1.extend(pshare6)
mandates6 = data[3]
seat.extend(mandates6)
#### TOTAL VOTES ####
tfoot = soup.find_all('tfoot')
for data in tfoot:
fvote = data.find_all('td')
for info in fvote:
votefinal = info.find(text=True).get_text()
fvoteindiv = [votefinal]
fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
vot1.extend(fvotelist)
#### CONSTITUENCY NAMES ####
constit = soup.find_all('a', class_='btn btn-link last')
for data in constit:
names = data.get_text()
names_clean = names.replace("Sejum Constituency no.","")
names_clean2 = names_clean.replace("[","")
names_clean3 = names_clean2.replace("]","")
namesfinal = names_clean3.split()[1]
constitindiv = [namesfinal]
constitlist = constitindiv * (len(pty_n) - len(cst_n))
cst_n.extend(constitlist)
#### UNSCRAPABLE INFO ####
region = 'Europe'
reg2 = [region]
reglist = reg2 * (len(pty_n) - len(rg))
rg.extend(reglist)
country = 'Poland'
ctr2 = [country]
ctrlist = ctr2 * (len(pty_n) - len(ctr_n))
ctr_n.extend(ctrlist)
year = '2019'
yr2 = [year]
yrlist = yr2 * (len(pty_n) - len(yr))
yr.extend(yrlist)
month = '10'
mo2 = [month]
molist = mo2 * (len(pty_n) - len(mn))
mn.extend(molist)
codes = ''
codes2 = [codes]
codeslist = codes2 * (len(pty_n) - len(manual))
manual.extend(codeslist)
noinfo = '-990'
noinfo2 = [noinfo]
noinfolist = noinfo2 * (len(pty_n) - len(no_info))
no_info.extend(noinfolist)
print(len(rg), len(pty_n), len(pv1), len(pvs1), len(no_info), len(vot1), len(cst_n))
poland19 = pd.DataFrame({
'rg' : rg,
'ctr_n' : ctr_n,
'ctr': manual,
'yr' : yr,
'mn' : mn,
'sub' : manual,
'cst_n': cst_n,
'cst' : manual,
'mag': manual,
'pty_n': pty_n,
'pty': manual,
'can': can,
'pev1': no_info,
'vot1': vot1,
'vv1': vot1,
'ivv1': no_info,
'to1': no_info,
'cv1': no_info,
'cvs1': no_info,
'pv1': cv1,
'pvs1': cvs1,
'pev2': no_info,
'vot2': no_info,
'vv2': no_info,
'ivv2': no_info,
'to2': no_info,
'cv2': no_info,
'cvs2': no_info,
'pv2' : no_info,
'pvs2' : no_info,
'seat' : manual
})
print(poland19)
poland19.to_csv('poland_19.csv')
As commented, you probably need to use Selenium. You could drop the requests lib and replace the request statements with something like this:
from selenium import webdriver
wd = webdriver.Chrome('pathToChromeDriver') # or any other Browser driver
wd.get(url) # instead of requests.get()
soup = BeautifulSoup(wd.page_source, 'html.parser')
You need to follow the instructions to install and implement the selenium lib at this link: https://selenium-python.readthedocs.io/
Note: I tested your code with Selenium and I was able to get the table that you were looking for, but selecting it by class_=... does not work for some reason.
Instead, browsing the scraped data, I found that the table has an id attribute. So maybe try this instead:
tbody = soup.find_all('table', id="DataTables_Table_0")
And again, by doing the get requests with the selenium lib.
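Putting the two suggestions together, a rough sketch of the fetch loop (the chromedriver path and the fixed sleep are assumptions, not tested values) could look like this:

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep

wd = webdriver.Chrome('pathToChromeDriver')  # or any other browser driver

for page in range(START_PAGE, END_PAGE + 1):
    wd.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/" + str(page))
    sleep(5)  # give the JavaScript-rendered tables time to load
    soup = BeautifulSoup(wd.page_source, 'html.parser')
    tbody = soup.find_all('table', id="DataTables_Table_0")
    # ... keep your existing parsing of tbody, tfoot and constituency names here ...

wd.quit()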
Hope that was helpful :)
Cheers
I'm working on a Sentiment Analysis project using Twitter Data, and I've encountered a small problem regarding Dates. The code itself runs fine, but I don't know how to build custom time blocks for grouping my final data. Right now, it is defaulting to grouping them by the second, which is not very useful. I want to be able to group them in half-hour, hour, and day segments...
Feel free to skip to the bottom of the code to see where the issue lies!
Here is the code:
import tweepy
API_KEY = "XXXXX"
API_SECRET = XXXXXX"
auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)
import sklearn as sk
import pandas as pd
import got3
#"Get Old Tweets" to find older data
tweetCriteria = got3.manager.TweetCriteria()
tweetCriteria.setQuerySearch("Kentucky Derby")
tweetCriteria.setSince("2016-05-07")
tweetCriteria.setUntil("2016-05-08")
tweetCriteria.setMaxTweets(1000)
TweetCriteria = got3.manager.TweetCriteria()
KYDerby_tweets = got3.manager.TweetManager.getTweets(tweetCriteria)
from afinn import Afinn
afinn = Afinn()
#getting afinn library to use for sentiment polarity analysis
for x in KYDerby_tweets:
    Text = x.text
    Retweets = x.retweets
    Favorites = x.favorites
    Date = x.date
    Id = x.id
    print(Text)
AllText = []
AllRetweets = []
AllFavorites = []
AllDates = []
AllIDs = []
for x in KYDerby_tweets:
    Text = x.text
    Retweets = x.retweets
    Favorites = x.favorites
    Date = x.date
    AllText.append(Text)
    AllRetweets.append(Retweets)
    AllFavorites.append(Favorites)
    AllDates.append(Date)
    AllIDs.append(Id)
data_set = [[x.id, x.date, x.text, x.retweets, x.favorites]
for x in KYDerby_tweets]
df = pd.DataFrame(data=data_set, columns=["Id", "Date", "Text", "Favorites", "Retweets"])
#I now have a DataFrame with my basic info in it
pscore = []
for x in KYDerby_tweets:
    afinn.score(x.text)
    pscore.append(afinn.score(x.text))
df['P Score'] = pscore
#I now have the pscores for each Tweet in the DataFrame
nrc = pd.read_csv('C:\\users\\andrew.smith\\downloads\\NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt', sep="\t", names=["word", "emotion", "association"], skiprows=45)
#import NRC emotion lexicon
nrc = nrc[nrc["association"]==1]
nrc = nrc[nrc["emotion"].isin(["positive", "negative"]) == False]
#cleaned it up a bit
from nltk import TweetTokenizer
tt = TweetTokenizer()
tokenized = [x.lower() for x in tokenized]
#built my Tweet-specific, NRC-ready tokenizer
emotions = list(set(nrc["emotion"]))
index2emotion = {}
emotion2index = {}
for i in range(len(emotions)):
    index2emotion[i] = emotions[i]
    emotion2index[emotions[i]] = i
cv = [0] * len(emotions)
#built indices showing locations of emotions
for token in tokenized:
    sub = nrc[nrc['word'] == token]
    token_emotions = sub['emotion']
    for e in token_emotions:
        position_index = emotion2index[e]
        cv[position_index]+=1
emotions = list(set(nrc['emotion']))
index2emotion = {}
emotion2index = {}
for i in range(len(emotions)):
    index2emotion[i] = emotions[i]
    emotion2index[emotions[i]] = i
def makeEmoVector(tweettext):
    cv = [0] * len(emotions)
    tokenized = tt.tokenize(tweettext)
    tokenized = [x.lower() for x in tokenized]
    for token in tokenized:
        sub = nrc[nrc['word'] == token]
        token_emotions = sub['emotion']
        for e in token_emotions:
            position_index = emotion2index[e]
            cv[position_index] += 1
    return cv

tweettext = df.iloc[14,:]['Text']
emotion_vectors = []
for text in df['Text']:
    emotion_vector = makeEmoVector(text)
    emotion_vectors.append(emotion_vector)
ev = pd.DataFrame(emotion_vectors, index=df.index, columns=emotions)
#Now I have a DataFrame with all of the emotion counts for each tweet
Date_Group = df.groupby("Date")
Date_Group[emotions].agg("sum")
#Finally, we arrive at the problem! When I run this, I end up with tweets that are grouped by the second. What I want is to be able to group them: a) by the half-hour, b) by the hour, and c) by the day
The default date format for tweets with the Tweepy API is "2017-04-14 18:41:56". To get tweets grouped by hour, you can do something as simple as this:
# This will get the time parameter
time = [item.split(" ")[1] for item in df['date'].values]
# This will get the hour parameter
hour = [item.split(":")[0] for item in time]
df['time'] = hour
grouped_tweets = df[['time', 'number_tweets']].groupby('time')
tweet_growth_hour = grouped_tweets.sum()
tweet_growth_hour['time']= tweet_growth_hour.index
print(tweet_growth_hour)
To group by date, you can do something similar:
days = [item.split(" ")[0] for item in df['date'].values]
df['days'] = days
grouped_tweets = df[['days', 'number_tweets']].groupby('days')
tweet_growth_days = grouped_tweets.sum()
tweet_growth_days['days']= tweet_growth_days.index
print(tweet_growth_days)
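Since the question also asks for half-hour blocks, a sketch using pandas' own datetime grouping may be more flexible than splitting strings; it assumes the emotion counts are joined onto df and that the Date column can be parsed as datetimes:

import pandas as pd

df = df.join(ev)                         # attach the emotion counts to the tweets
df['Date'] = pd.to_datetime(df['Date'])

# half-hour, hour, and day buckets respectively
by_half_hour = df.groupby(pd.Grouper(key='Date', freq='30min'))[emotions].sum()
by_hour = df.groupby(pd.Grouper(key='Date', freq='H'))[emotions].sum()
by_day = df.groupby(pd.Grouper(key='Date', freq='D'))[emotions].sum()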
I'm running this code but I get an indentation error. What seems to be the problem? I have run it many times. When I run the code through IDLE, I get a box that says:
Syntax Error: There's an error in your program: expected an indented block
Code:
import arcpy
from arcpy import env
import math
folder_path = r"J:\sanda"
# Define workspace as your folder path
env.workspace = folder_path
# Allow overwriting output files
arcpy.env.overwriteOutput = True
#parameters as text
input_lake = arcpy.GetParameterAsText(0)
input_cities = arcpy.GetParameterAsText(1)
output_lake = arcpy.GetParameterAsText(2)
city= "CITY_NAME"
cntry= "CITY_CNTRY"
admin= "ADMIN_NAME"
pop_city= "Population"
dist_km= "Distance"
x_coord= "X_CORD"
y_coord= "Y_CORD"
#copy lakes shapefile
arcpy.CopyFeatures_management(input_lake, output_lake)
#Add fields (city_name, x coord, y coord, etc)
arcpy.AddField_management(output_lake, city, "TEXT")
arcpy.AddField_management(output_lake, cntry , "TEXT")
arcpy.AddField_management(output_lake, admin, "TEXT")
arcpy.AddField_management(output_lake, pop_city, "DOUBLE")
arcpy.AddField_management(output_lake, dist_km, "DOUBLE")
arcpy.AddField_management(output_lake, x_coord, "DOUBLE")
arcpy.AddField_management(output_lake, y_coord, "DOUBLE")
#create empty lists
citylist_city_name = []
citylist_X = []
citylist_Y = []
city_name = []
city_cntry = []
admin_name = []
dist_km= []
pop= []
#populate these lists with values
city_cursor= arcpy.SearchCursor(input_cities)
for city in city_cursor:
geom = city.Shape
citylist_X.append(geom.firstPoint.X)
citylist_Y.append(geom.firstPoint.Y)
citylist_city_name.append(city.CITY_NAME)
city_cntry.append(city.CNTRY_NAME)
admin_name.append(city.ADMIN_NAME)
pop.append(city.Population)
#get the number of cities
city_length = len(citylist_X)
#read lake geometries
lake_cursor = arcpy.UpdateCursor(output_lake)
#loop through each lake
for lake in lake_cursor:
lake_geom = lake.Shape
#initiate lake distances
city_dist_list = []
#loop through each city
for cityID in range(0, city_length - 1):
#get x and y for the current city
cityX=citylist_X[cityID]
cityY=citylist_Y[cityID]
#get x and y for the current lake
lakeX = lake_geom.centroid.X
lakeY = lake_geom.centroid.Y
#calculate the distance
dist = math.sqrt((cityX-lakeX)**2 +(cityY-lakeY)**2
city_dist_list.append(dist)
closest = min(city_dist_list)
closestID = city_dist_list.index(closest)
#set values into the new lake feature
lake.CITY_NAME = citylist_city_name[closestID]
lake.X_CORD = citylist_X [closestID]
lake.Y_CORD = citylist_Y [closestID]
lake.Distance = closest*(0.001)
lake.ADMIN_NAME = admin_name [closestID]
lake.Population = pop [closestID]
lake.city_cntry = city_cntry [closestID]
lake_cursor.updateRow(lake)
#kill the cursors
del city_cursor, lake_cursor, lake, city, cityID, geom, lake_geom
print "Done"
The for-loop in your code is not indented properly; it must be like this:
for cityID in range(0, city_length - 1):
    #get x and y for the current city
    cityX=citylist_X[cityID]
    cityY=citylist_Y[cityID]
    #get x and y for the current lake
    lakeX = lake_geom.centroid.X
    lakeY = lake_geom.centroid.Y
    #calculate the distance
    dist = math.sqrt((cityX-lakeX)**2 +(cityY-lakeY)**2)
    city_dist_list.append(dist)
closest = min(city_dist_list)
closestID = city_dist_list.index(closest)
#set values into the new lake feature
lake.CITY_NAME = citylist_city_name[closestID]
lake.X_CORD = citylist_X [closestID]
lake.Y_CORD = citylist_Y [closestID]
lake.Distance = closest*(0.001)
lake.ADMIN_NAME = admin_name [closestID]
lake.Population = pop [closestID]
lake.city_cntry = city_cntry [closestID]
lake_cursor.updateRow(lake)
Secondly, you were missing a closing ) on this line:
dist = math.sqrt((cityX-lakeX)**2 +(cityY-lakeY)**2)