I have this code:
fileAspect = []
id_filename = -1
for filename in glob.glob(os.path.join(path, '*.dat')):
    id_filename += 1
    f = open(filename, 'r')
    content = f.read()
    soup = BeautifulSoup(content, 'html.parser')
    all_paragraph = soup.find_all('content')
    result_polarity = []
    for all_sentences in all_paragraph:
        sentences = nlp.parse(all_sentences.get_text())
        for sentence in sentences['sentences']:
            for words in sentence['dependencies']:
                if words[1] != "ROOT":
                    result_proccess = proccess1(myOntology, words)
                    if result_proccess:
                        result_polarity.append(words)
                        ini += 1
                    else:
                        out += 1
    ont = myOntology
    for idx_compare1 in range(len(ont)):
        for idy in range(len(result_polarity)):
            for idw in range(len(result_polarity[idy])):
                if result_polarity[idy][1] != "ROOT" and idw != 0:
                    try:
                        closest = model.similarity(ont[idx_compare1][0], result_polarity[idy][idw])
                        if closest >= 0.1:
                            valpolar = valuePol(result_polarity[idy][idw])
                            if valpolar == 1:
                                ont[idx_compare1][2] += 1
                            elif valpolar == 2:
                                ont[idx_compare1][3] += 1
                            tp += 1
                            break
                    except Exception as inst:
                        fn += 1
                        break
    print "^^^^^^^^^^^^^"
    print id_filename
    print ont
    fileAspect.append(ont)
print 'overall'
#print fileAspect
for aspek in fileAspect:
    print aspek
    print "---"
Why does the result replace the previous data?
A simple example behaves the way I expect:
a = []
a.append('q')
a.append('w')
a.append('e')
print a
the result is :
['q','w','e']
but with my code above I get:
['e','e','e']
So the problem is: when I append to the list fileAspect, it doesn't only append, it also replaces the previously appended data. With the simple code it's fine, but in my code above it isn't, and I'm very confused. I've been trying for more than an hour.
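A minimal sketch that reproduces the symptom (with a toy list standing in for myOntology):

myOntology = [['aspect', 0]]
fileAspect = []
for polarity in (1, 2, 3):
    ont = myOntology           # binds a second name to the SAME list object
    ont[0][1] = polarity       # mutating ont also mutates myOntology
    fileAspect.append(ont)     # every appended element is that one object
print(fileAspect)              # [[['aspect', 3]], [['aspect', 3]], [['aspect', 3]]]

Appending copy.deepcopy(myOntology) instead would store an independent snapshot on each iteration.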
Thanks, any help is greatly appreciated.
I am new to Python, but fairly experienced in programming. While learning Python I was trying to create a simple function that reads words in from a text file (each line in the text file is a new word) and then checks whether each word has the letter 'e' or not. The program should then count the number of words that don't have the letter 'e' and use that count to calculate the percentage of words in the text file without an 'e'.
I am running into a problem where I'm fairly certain my code is right, but after testing, the output is wrong. Please help!
Here is the code:
def has_n_e(w):
    hasE = False
    for c in w:
        if c == 'e':
            hasE = True
    return hasE

f = open("crossword.txt","r")
count = 0
for x in f:
    word = f.readline()
    res = has_n_e(word)
    if res == False:
        count = count + 1

iAns = (count/113809)*100  # 113809 is the amount of words in the text file
print (count)
rAns = round(iAns,2)
sAns = str(rAns)
fAns = sAns + "%"
print(fAns)
Here is the code after doing some changes that may help:
def has_n_e(w):
    hasE = False
    for c in w:
        if c == 'e':
            hasE = True
    return hasE

f = open("crossword.txt","r").readlines()
count = 0
for x in f:
    word = x[:-1]
    res = has_n_e(word)  # you can use ('e' in word) instead of the function
    if res == False:
        count = count + 1

iAns = (count/len(f))*100  # len(f) is the amount of words in the text file
print (count)
rAns = round(iAns,2)
sAns = str(rAns)
fAns = sAns + "%"
print(fAns)
Hope this helps.
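As a side note, the ('e' in word) idiom mentioned in the comment lets the whole count collapse into a few lines; a minimal sketch, assuming Python 3, where / is true division:

with open("crossword.txt", "r") as f:
    words = [line.strip() for line in f]

# count the words that do NOT contain an 'e'
count = sum(1 for word in words if 'e' not in word)
print("{}%".format(round(count / len(words) * 100, 2)))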
Hello, I am working on my project. I want to get candidate text blocks by using the algorithm below.
My input is a CSV document which contains:
HTML column : the HTML code in a line
TAG column : the tag of the HTML code in a line
Words : the text inside the tag in a line
TC : the number of words in a line
LTC : the number of anchor words in a line
TG : the number of tags in a line
P : the number of p and br tags in a line
CTTD : TC + (0.2*LTC) + TG - P
CTTDs : the smoothed CTTD
This is my algorithm to find candidate text blocks. I load the CSV file into a dataframe using pandas, and I use the CTTDs, TC, and TG columns to find the candidates.
from ListSmoothing import get_filepaths_smoothing
import pandas as pd
import numpy as np
import csv

filenames = get_filepaths_smoothing(r"C:\Users\kimhyesung\PycharmProjects\newsextraction\smoothing")
index = 0
for f in filenames:
    file_html = open(str(f), "r")
    df = pd.read_csv(file_html)
    #df = pd.read_csv('smoothing/Smoothing001.csv')
    news = np.array(df['CTTDs'])
    new = np.array(df['TG'])

    minval = np.min(news[np.nonzero(news)])
    maxval = np.max(news[np.nonzero(news)])
    j = 0.2
    thetaCTTD = minval + j * (maxval - minval)
    #maxGap = np.max(new[np.nonzero(new)])
    #minGap = np.min(new[np.nonzero(new)])
    thetaGap = np.min(new[np.nonzero(new)])
    #print thetaCTTD
    #print maxval
    #print minval
    #print thetaGap

    def create_candidates(df, thetaCTTD, thetaGAP):
        k = 0
        TB = {}
        TC = 0
        for index in range(0, len(df) - 1):
            start = index
            if df.ix[index]['CTTDs'] > thetaCTTD:
                start = index
                gap = 0
                TC = df.ix[index]['TC']
                for index in range(index + 1, len(df) - 1):
                    if df.ix[index]['TG'] == 0:
                        continue
                    elif df.ix[index]['CTTDs'] <= thetaCTTD and gap >= thetaGAP:
                        break
                    elif df.ix[index]['CTTDs'] <= thetaCTTD:
                        gap += 1
                    TC += df.ix[index]['TC']
                if (TC < 1) or (start == index):
                    continue
                TB.update({
                    k: {
                        'start': start,
                        'end': index - 1
                    }
                })
                k += 1
        return TB

    def get_unique_candidate(tb):
        TB = tb.copy()
        for key, value in tb.iteritems():
            if key == len(tb) - 1:
                break
            if value['end'] == tb[key + 1]['end']:
                del TB[key + 1]
            elif value['start'] < tb[key + 1]['start'] < value['end']:
                TB[key]['end'] = tb[key + 1]['start'] - 1
            else:
                continue
        return TB

    index += 1
    stored_file = "textcandidate/textcandidate" + '{0:03}'.format(index) + ".csv"
    tb = create_candidates(df, thetaCTTD, thetaGap)
    TB = get_unique_candidate(tb)

    filewrite = open(stored_file, "wb")
    df_list = []
    for (k, d) in TB.iteritems():
        candidate_df = df.loc[d['start']:d['end']]
        candidate_df['candidate'] = k
        df_list.append(candidate_df)
    output_df = pd.concat(df_list)
    output_df.to_csv(stored_file)
    writer = csv.writer(filewrite, lineterminator='\n')
    filewrite.close()
thetaCTTD is 10.36 and thetaGap is 1.
The output means there are 2 candidate text blocks. The first candidate starts at line number 215 and ends at line number 225; the other starts at line number 500 and ends at line number 501.
My question is: how can I save the output to CSV so that not only the line numbers but the whole range of rows in each text block, with the other columns, appears in the output too?
Assuming your output is a dictionary of dictionaries:
pd.concat([df.loc[d['start']:d['end']] for (k, d) in TB.iteritems()])
Note that we slice by label, so d['end'] will be included.
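To see that inclusive endpoint, compare label slicing with positional slicing; a quick illustration on a hypothetical 5-row frame:

import pandas as pd

df = pd.DataFrame({'TC': [3, 7, 2, 9, 4]})
print(df.loc[1:3])   # label-based: rows 1, 2 AND 3
print(df.iloc[1:3])  # position-based: rows 1 and 2 only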
Edit: add the candidate number in a new column.
It's cleaner to write a loop than to do two concat operations:
df_list = []
for (k, d) in TB.iteritems():
    candidate_df = df.loc[d['start']:d['end']]
    candidate_df['candidate'] = k
    df_list.append(candidate_df)
output_df = pd.concat(df_list)
It's also faster to concatenate all dataframes at once at the end.
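From there, saving to CSV is a single call; a minimal sketch (the file name is just an example):

# the index column keeps the original line numbers; every original
# column plus the new 'candidate' column is written out
output_df.to_csv("textcandidate001.csv")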
I am scraping search results from a website where each result is contained in a <div> and has a range of data associated with it. However, some of these data values are missing, and when they are, the error "'NoneType' object has no attribute 'text'" is raised.
I have put in a try/except block. Currently the entire search result is skipped when one of the values is missing. What can I do to replace the missing values with "" (blank) in the xls file I am saving to?
My code is below:
divs = soup.find_all("div", class_="result-item standard") + soup.find_all("div", class_="result-item standard basic-ad")
for div in divs:
    try:
        #item_title = " ".join(div.h2.a.text.split())
        item = div.h2.a.text.split()
        item_year = item[0]
        item_make = item[1]
        item_model = ""
        for i in range(2, len(item)):
            item_model = item_model + item[i] + " "
        item_eng = div.find("li", "item-engine").text
        item_trans = div.find("li", "item-transmission").text
        item_body = div.find("li", "item-body").text
        item_odostr = div.find("li", "item-odometer").text
        item_odo = ''.join(c for c in item_odostr if c.isdigit())
        item_pricestr = " ".join(div.find("div", "primary-price").text.split())
        item_price = ''.join(c for c in item_pricestr if c.isdigit())
        item_adtype = div.find("div", "ad-type").span.text
        #item_distance = div.find("a", "distance-from-me-link").text
        item_loc = div.find("div", "call-to-action").p.text
        item_row = (str(x), item_year, item_make, item_model, item_eng, item_trans, item_body, item_odo, item_price, item_adtype, item_loc)
        print ",".join(item_row)
        print(" ")
        for i in range(len(item_row)):
            ws.write(x, i, item_row[i])
        if x % 500 == 0:
            wb.save("data.xls")
    except AttributeError as e:
        with open("error" + str(x) + ".txt", "w+") as error_file:
            error_file.write(div.text.encode("utf-8"))
For example:
item_eng = div.find("li", "item-engine").text if div.find("li", "item-engine") else ''
or:
item_eng = div.find("li", "item-engine").text if len(div.find_all("li", "item-engine"))!=0 else ''
I am trying to get exact word matches from my file, along with their line numbers.
When I search for abc10 it also gives me all the near matches, e.g. abc102, abc103, etc.
How can I restrict my code to print only the exact word I searched for?
Here is my code:
lineNo = 0
linesFound = []
inFile = open('rxmop.txt', 'r')
sKeyword = input("enter word ")

done = False
while not done:
    pos = inFile.tell()
    sLine = inFile.readline()
    if sLine == "":
        done = True
        break
    if (sLine.find(sKeyword) != -1):
        print ("Found at line: " + str(lineNo))
        tTuple = lineNo, pos
        linesFound.append(tTuple)
    lineNo = lineNo + 1

done = False
while not done:
    command = int(input("Enter the line you want to view: "))
    if command == -1:
        done = True
        break
    for tT in linesFound:
        if command == tT[0]:
            inFile.seek(tT[1])
            lLine = inFile.readline()
            print ("The line at position " + str(tT[1]) + " is: " + lLine)
"like when i search for abc10 it gives me all the possible answers e.g abc102 abc103 etc"
You split each record and compare whole "words" only.
to_find = "RXOTG-10"
list_of_possibles = ["RXOTG-10 QTA5777 HYB SY G12",
"RXOTG-100 QTA9278 HYB SY G12"]
for rec in list_of_possibles:
words_list=rec.strip().split()
if to_find in words_list:
print "found", rec
else:
print " NOT found", rec
I have an HTML table that I need to parse into a CSV file.
import urllib2, datetime
olddate = datetime.datetime.strptime('5/01/13', "%m/%d/%y")
from BeautifulSoup import BeautifulSoup

print("dates,location,name,url")

def genqry(arga, argb, argc, argd):
    return arga + "," + argb + "," + argc + "," + argd

part = 1
row = 1
contenturl = "http://www.robotevents.com/robot-competitions/vex-robotics-competition"
soup = BeautifulSoup(urllib2.urlopen(contenturl).read())
table = soup.find('table', attrs={'class': 'catalog-listing'})
rows = table.findAll('tr')
for tr in rows:
    try:
        if row != 1:
            cols = tr.findAll('td')
            for td in cols:
                if part == 1:
                    keep = 0
                    dates = td.find(text=True)
                    part = 2
                if part == 2:
                    location = td.find(text=True)
                    part = 2
                if part == 3:
                    name = td.find(text=True)
                    for a in tr.findAll('a', href=True):
                        url = a['href']
                    # Compare Dates
                    if len(dates) < 6:
                        newdate = datetime.datetime.strptime(dates, "%m/%d/%y")
                        if newdate > olddate:
                            keep = 1
                        else:
                            keep = 0
                    else:
                        newdate = datetime.datetime.strptime(dates[:6], "%m/%d/%y")
                        if newdate > olddate:
                            keep = 1
                        else:
                            keep = 0
                    if keep == 1:
                        qry = genqry(dates, location, name, url)
                        print(qry)
                    row = row + 1
                    part = 1
        else:
            row = row + 1
    except (RuntimeError, TypeError, NameError):
        print("Error: " + name)
I need to be able to get every VEX event in that table that is after 5/01/13. So far, this code gives me an error about the dates that I can't seem to fix. Maybe someone who is better at this than I am can fix this code? Thanks in advance, Smith.
EDIT #1: The error that I am getting is:
ValueError: '\n10/5/13' does not match format '%m/%d/%y'
I think that I need to remove the newlines at the beginning of the string first.
EDIT #2: Got it to run without any output. Any help?
Your question is very poor. Without knowing the exact error, I would guess the problem is with your if len(dates) < 6: block. Consider the following:
>>> date = '10/5/13 - 12/14/13'
>>> len(date)
18
>>> date = '11/9/13'
>>> len(date)
7
>>> date[:6]
'11/9/1'
One suggestion to make your code more Pythonic: Instead of doing row = row + 1, use enumerate.
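For example, a sketch of that change applied to the loop above:

# enumerate supplies the row counter instead of a manual one
for row, tr in enumerate(rows, 1):
    if row == 1:
        continue  # skip the header row
    cols = tr.findAll('td')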
Update: Tracing your code, I get the value of dates as follows:
>>> dates
u'\n10/5/13 - 12/14/13 \xa0\n '
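Given that value, a plausible fix (a sketch, assuming you want the first date of the range) is to strip the whitespace and cut off the range before parsing:

import datetime

raw = u'\n10/5/13 - 12/14/13 \xa0\n '
first = raw.strip().split(' - ')[0].strip()  # '10/5/13'
newdate = datetime.datetime.strptime(first, "%m/%d/%y")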