I have built an inverted index dictionary of text collection and need to compress dictionary using blocked compression k=8 and in posting file,gaps between docids using gamma encoding
def createCompressedIndex(dictionary_uncomp_v1):
for term in dictionary_uncomp_v1.keys():
entry = dictionary_uncomp_v1.get(term)
postingList = []
prevId = 0
pEntry = PostingEntry(0,0,0,0)
for pEntry in entry.postingList:
docId = getGammaCode(pEntry.docId - prevId)
frequency = getGammaCode(pEntry.termFreq)
newPEntry = PostingEntry(docId,frequency,0,0)
postingList.extend(newPEntry)
prevId = pEntry.docId
ptemp = docId+frequency
docFrequency = getGammaCode(entry.docFreq)
entrytemp = ptemp+docFrequency
totalTermFreq = getGammaCode(entry.totTermFreq)
compressedEntry = DictEntry(term,docFrequency,totalTermFreq,postingList)
dictionary_comp_v1[term] = bytearray(entrytemp)
Related
im trying to write a Collaborative Filter for my movie recommendation engine. I need a mapping dictionary for fuzzy matching. I tried to write a movieId-title matrix but i got an exception like that:
Keyerror none of are in the [index] in enumerate line.
I checked matrices, id's etc. But i couldn't find solution. (I am using The Movies Dataset from kaggle).
movies_metadata = pd.DataFrame(pd.read_csv('movies_metadata.csv'))
ratings = pd.DataFrame(pd.read_csv('ratings_small.csv'))
movies = movies_metadata[['id','title']]
movies['movieId'] = movies['id']
movies = movies.drop('id',axis = 1)
ratings = ratings.drop('timestamp', axis=1)
user_nums = len(ratings.userId.unique())
item_nums = len(movies.movieId.unique())
total_count = user_nums * item_nums
rating_zero = total_count - ratings.shape[0]
ratings_count_temp = pd.DataFrame(ratings.groupby('rating').size(), columns = ['count'])
ratings_count = ratings_count_temp.append(pd.DataFrame({'count': rating_zero}, index=[0.0]),verify_integrity=True,).sort_index()
ratings_count['log_count'] = np.log(ratings_count['count'])
movies_count = pd.DataFrame(ratings.groupby('movieId').size(), columns = ['count'])
movies_count.head()
movies_count['count'].quantile(np.arange(1, 0.6, -0.05))
popular_movies = list(set(movies_count.query('count >= 50').index))
popular_ratings = ratings[ratings.movieId.isin(popular_movies)]
users_count = pd.DataFrame(popular_ratings.groupby('userId').size(), columns=['count'])
users_count['count'].quantile(np.arange(1, 0.5, -0.05))
active_users = list(set(users_count.query('count >= 50').index))
popular_active_ratings = popular_ratings[popular_ratings.userId.isin(active_users)]
movie_user_mat = popular_active_ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)
movie_to_idx = {
movies: i for i, movies in
enumerate(list(movies.set_index('movieId').loc[movie_user_mat.index].title))
}
I am trying to add data that I am reading from a series of JSON files to a Numpy array (or whatever data collection would work best). My idea, is that I want to sort a collection of episodes of a tv show by episode title.
The problem I have encountered, is actually creating the collection from the data.
The intent, is that I want to be able to have a collection of the items found within the for loop [a,b,c,d]; for each episode of the show.
Is a Numpy array the best way to go about making this collection, or should I use something else?
season1 = open('THEJSONFILES\seasonone.json', 'r')
season_array = np.array(['episodeTitle','seasonNum', 'episodeNum', 'plotContents'])
def ReadTheDarnJsonFile(jsonTitle):
seasondata = jsonTitle.read()
seasonobj = j.loads(seasondata)
list = (seasonobj['episodes'])
for i in range(len(list)):
a = str(list[i].get('title'))
b = str(list[i].get('seasonNumber'))
c = str(list[i].get('episodeNumber'))
d = str(list[i].get('plot'))
print(a, b, c, d)
print("----------------")
# np.append(season_array, [a,b,c,d]) this is not correct
ReadTheDarnJsonFile(season1)
print(season_array)
2 notes. First I would avoid using list as a variable name because it is a keyword in python. Second I would recommend using a custom class for your data for maximum readability.
season1 = open('THEJSONFILES\seasonone.json', 'r')
season_array = np.array(['episodeTitle','seasonNum', 'episodeNum', 'plotContents'])
class episode:
def __init__(self,title,seasonNumber,episodeNumber,plot):
self.title = title
self.seasonNumber = seasonNumber
self.episodeNumber = episodeNumber
self.plot = plot
def summary(self):
print("Season "+str(self.seasonNumber)+" Episode "+str(self.episodeNumber))
print(self.title)
print(self.plot)
def ReadTheDarnJsonFile(jsonTitle):
seasondata = jsonTitle.read()
seasonobj = j.loads(seasondata)
episodes = (seasonobj['episodes'])
season_array = []
for i in range(len(episodes)):
a = str(list[i].get('title'))
b = str(list[i].get('seasonNumber'))
c = str(list[i].get('episodeNumber'))
d = str(list[i].get('plot'))
season_array.append(episode(a,b,c,d)) this is not correct
return season_array
season_array = Read
TheDarnJsonFile(season1)
for item in season_array:
item.summary()
Here is what I ended up doing.
import json as j
import pandas as pd
emptyArray = []
season1 = open('THEJSONFILES\seasonone.json', 'r')
season2 = open('THEJSONFILES\seasontwo.json', 'r')
season3 = open('THEJSONFILES\seasonthree.json', 'r')
season4 = open('THEJSONFILES\seasonfour.json', 'r')
season5 = open('THEJSONFILES\seasonfive.json', 'r')
season6 = open('THEJSONFILES\seasonsix.json', 'r')
season7 = open('THEJSONFILES\seasonseven.json', 'r')
columnData = ["episodeTitle", "seasonIndex", "episodeIndex", "plot", "imageURL"]
finalDf = pd.DataFrame
def ReadTheDarnJsonFile(jsonTitle):
df = pd.DataFrame(columns = columnData)
seasonData = jsonTitle.read()
seasonObj = j.loads(seasonData)
currentSeasonList = (seasonObj['episodes'])
for i in range(len(currentSeasonList)):
tempTitle = str(currentSeasonList[i].get('title'))
tempSN = str(currentSeasonList[i].get('seasonNumber'))
tempEN = str(currentSeasonList[i].get('episodeNumber'))
tempPlot = str(currentSeasonList[i].get('plot'))
tempImage = str(currentSeasonList[i].get('image'))
dataObj = pd.Series([tempTitle, tempSN, tempEN, tempPlot, tempImage], index=(df.columns))
df.loc[i] = dataObj
emptyArray.append(df)
ReadTheDarnJsonFile(season1)
ReadTheDarnJsonFile(season2)
ReadTheDarnJsonFile(season3)
ReadTheDarnJsonFile(season4)
ReadTheDarnJsonFile(season5)
ReadTheDarnJsonFile(season6)
ReadTheDarnJsonFile(season7)
finalDf = pd.concat(emptyArray)
print(emptyArray)
holyOutput = finalDf.sort_values(by=['episodeTitle'])
holyOutput.reset_index(inplace=True)
holyOutput.to_json("P:\\ProjectForStarWarsCloneWarsJson\JSON\OutputJsonV2.json")
I've been trying to set up a script to automatically assign connector displacement boundary conditions. When I run the script it all looks fine in the GUI (wires are created, BCs are created and assigned the right value), but when I submit I get the following error: "Element connectivity is missing for element x of type "CONN3D2" and the element connectivity is in fact missing in the input file. I assign the edges by using the midpoints between the wire start and ends, but for some reason it doesn't assign them to the elements. This is my connector assignment function:
def assignConnectors(self):
p = self.m.parts[self.partName]
a = self.m.rootAssembly
a.Instance(name=self.instanceName, part=p, dependent=ON)
e = a.edges
n = a.instances[self.instanceName].nodes
#allelements = p.Set(name='allElements', elements=self.listObjElem)
elset = a.instances[self.instanceName].elements
elsetAssembly = a.Set('assemblyElements', elements=elset)
a.regenerate()
v1 = a.instances[self.instanceName].vertices
rows = len(self.listConstraints)
columns = len(self.listConstraints[0])
total = rows*columns
listObjNode=[];
self.listObjElem=[];
self.listObjConnector=[];
for j,pairElem in enumerate(self.listElem):
p1 = a.getCoordinates(self.listNodes[pairElem[0]-1])
p2 = a.getCoordinates(self.listNodes[pairElem[1]-1])
#print(p1,p2)
wires = a.WirePolyLine(points=((p1,p2),), mergeType=IMPRINT, meshable=OFF)
a.regenerate()
pt1 = a.getCoordinates(self.listNodes[pairElem[0]-1])
pt2 = a.getCoordinates(self.listNodes[pairElem[1]-1])
print(pt1,pt2)
pt11 = np.asarray(pt1[0])
pt12 = np.asarray(pt1[1])
pt13 = np.asarray(pt1[2])
pt21 = np.asarray(pt2[0])
pt22 = np.asarray(pt2[1])
pt23 = np.asarray(pt2[2])
new_p1 = (pt11+pt21)/2
new_p2 = (pt12+pt22)/2
new_p3 = (pt13+pt23)/2
new_p = tuple([new_p1,new_p2,new_p3])
print(new_p)
a = self.m.rootAssembly
e = a.edges
edges1 = e.findAt((new_p, ))
print(edges1)
region = a.Set(edges = edges1, name='Set'+str(j))
self.m.ConnectorSection(name='ConnSect-1'+str(j),translationalType=AXIAL)
csa = a.SectionAssignment(sectionName='ConnSect-1'+str(j), region=region)
self.m.ConnDisplacementBC(name='BC-'+str(j+total), createStepName=self.stepName, fastenerSetName='Set'+str(j), u1=float(self.listElongations[j]), u2=UNSET, u3=UNSET, ur1=UNSET, ur2=UNSET, ur3=UNSET, amplitude=UNSET, fixed=OFF, distributionType=UNIFORM)
a.regenerate()
Am I assigning the elements wrong somehow?
Thanks a lot for any help!
I'm working on a web scraping project, and have all the right code that returns me the json data in the format that I want if I used the #print command below, but when I got to run the same code except through Pandas Dataframe it only returns the first row of Data that I'm looking for. Just running the print, it returns the expected 17 rows of data I'm looking for. Dataframe to CSV gives me the first row only. Totally stumped! So grateful for anyone's help!
for item in response['body']:
DepartureDate = item['legs'][0][0]['departDate']
ReturnDate = item['legs'][1][0]['departDate']
Airline = item['legs'][0][0]['airline']['code']
Origin = item['legs'][0][0]['depart']
Destination = item['legs'][0][0]['destination']
OD = (Origin + Destination)
TrueBaseFare = item['breakdown']['baseFareAmount']
YQYR = item['breakdown']['fuelSurcharge']
TAX = item['breakdown']['totalTax']
TTL = item['breakdown']['totalFareAmount']
MARKEDUPTTL = item['breakdown']['totalCalculatedFareAmount']
MARKUP = ((MARKEDUPTTL - TTL) / (TTL)*100)
FBC = item['fareBasisCode']
#print(DepartureDate,ReturnDate,Airline,OD,TrueBaseFare,YQYR,TAX,TTL,MARKEDUPTTL,MARKUP,FBC)
MI = pd.DataFrame(
{'Dept': [DepartureDate],
'Ret': [ReturnDate],
'AirlineCode': [Airline],
'Routing': [OD],
'RealFare': [TrueBaseFare],
'Fuel': [YQYR],
'Taxes': [TAX],
'RealTotal': [TTL],
'AgencyTotal': [MARKEDUPTTL],
'Margin': [MARKUP],
'FareBasis': [FBC],
})
df = pd.DataFrame(MI)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
df.to_csv('MITest7.csv')
When you print all your values after the cycle, you will see that you get only the last values. To resolve this problem you need to create lists and put there your values.
Try this:
DepartureDate = []
ReturnDate = []
Airline = []
Origin = []
Destination = []
OD = []
TrueBaseFare = []
YQYR = []
TAX = []
TTL = []
MARKEDUPTTL = []
MARKUP = []
FBC = []
for item in response['body']:
DepartureDate.append(item['legs'][0][0]['departDate'])
ReturnDate.append(item['legs'][1][0]['departDate'])
Airline.append(item['legs'][0][0]['airline']['code'])
Origin.append(item['legs'][0][0]['depart'])
Destination.append(item['legs'][0][0]['destination'])
OD.append((Origin[-1] + Destination[-1]))
TrueBaseFare.append(item['breakdown']['baseFareAmount'])
YQYR.append(item['breakdown']['fuelSurcharge'])
TAX.append(item['breakdown']['totalTax'])
TTL.append(item['breakdown']['totalFareAmount'])
MARKEDUPTTL.append(item['breakdown']['totalCalculatedFareAmount'])
MARKUP.append(((MARKEDUPTTL[-1] - TTL[-1]) / (TTL[-1])*100))
FBC.append(item['fareBasisCode'])
I have a code for a VoiceActivityDetector and want to give out the value speech_ratio which is in a function
I tried to set up a new function to print out the value
def __init__(self, wave_input_filename):
self._read_wav(wave_input_filename)._convert_to_mono()
self.sample_window = 0.02 #20 ms
self.sample_overlap = 0.01 #10ms
self.speech_window = 0.5 #half a second
self.speech_energy_threshold = 0.6 #60% of energy in voice band
self.speech_start_band = 300
self.speech_end_band = 3000
#self.speech_ratio = 0
def detect_speech(self):
""" Detects speech regions based on ratio between speech band energy
and total energy.
Output is array of window numbers and speech flags (1 - speech, 0 - nonspeech).
"""
detected_windows = np.array([])
sample_window = int(self.rate * self.sample_window)
sample_overlap = int(self.rate * self.sample_overlap)
data = self.data
sample_start = 0
start_band = self.speech_start_band
end_band = self.speech_end_band
while (sample_start < (len(data) - sample_window)):
sample_end = sample_start + sample_window
if sample_end>=len(data): sample_end = len(data)-1
data_window = data[sample_start:sample_end]
energy_freq = self._calculate_normalized_energy(data_window)
sum_voice_energy = self._sum_energy_in_band(energy_freq, start_band, end_band)
sum_full_energy = sum(energy_freq.values())
speech_ratio = sum_voice_energy/sum_full_energy
#self.speech_ratio2 = speech_ratio
# Hipothesis is that when there is a speech sequence we have ratio of energies more than Threshold
speech_ratio = speech_ratio>self.speech_energy_threshold
detected_windows = np.append(detected_windows,[sample_start, speech_ratio])
sample_start += sample_overlap
detected_windows = detected_windows.reshape(int(len(detected_windows)/2),2)
detected_windows[:,1] = self._smooth_speech_detection(detected_windows)
return detected_windows
def printing(self):
print(self.speech_ratio)
return self.speech_ratio
When I set speech_ratio as a variable in the init it does not change the variable later on in the detect_speech function.
If I do not initialize speech_ratio in the init function it wont be a attribute of my object at all.
You use self.speech_ratio to try and print the value; you should use the same expression to assign to it.