Any idea why this is always writing the same line to the output CSV?
21 files = glob.glob(path)
22 csv_file_complete = open("graph_complete_reddit.csv", "wb")
23 stat_csv_file = open("test_stat.csv", "r")
24 csv_reader = csv.reader(stat_csv_file)
25 lemmatizer = WordNetLemmatizer()
26 for file1, file2 in itertools.combinations(files, 2):
27 with open(file1) as f1:
28 print(file1)
29 f1_text = f1.read()
30 f1_words = re.sub("[^a-zA-Z]", ' ', f1_text).lower().split()
31 f1_words = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in f1_words if w not in stopwords]
32 print(f1_words)
33 f1.close()
34 with open(file2) as f2:
35 print(file2)
36 f2_text = f2.read()
37 f2_words = re.sub("[^a-zA-Z]", ' ', f2_text).lower().split()
38 f2_words = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in f2_words if w not in stopwords]
39 print(f2_words)
40 f2.close()
41
42 a_complete = csv.writer(csv_file_complete, delimiter=',')
43 print("*****")
44 print(file1)
45 print(file2)
46 print("************************************")
47
48 f1_head, f1_tail = os.path.split(file1)
49 print("************")
50 print(f1_tail)
51 print("**************")
52 f2_head, f2_tail = os.path.split(file2)
53 print(f2_tail)
54 print("********************************")
55 for row in csv_reader:
56 if f1_tail in row:
57 file1_file_number = row[0]
58 file1_category_number = row[2]
59 if f2_tail in row:
60 file2_file_number = row[0]
61 file2_category_number = row[2]
62
63 row_complete = [file1_file_number, file2_file_number, file1_category_number, file2_category_number ]
64 a_complete.writerow(row_complete)
65
66 csv_file_complete.close()
Those prints show different filenames!
This is test_stat.csv file which the code uses as input:
1 1,1bmmoc.txt,1
2 2,2b3u1a.txt,1
3 3,2mf64u.txt,2
4 4,4x74k3.txt,5
5 5,lsspe.txt,3
6 6,qbimg.txt,4
7 7,w95fm.txt,2
And here's what the code outputs:
1 7,4,2,5
2 7,4,2,5
3 7,4,2,5
4 7,4,2,5
5 7,4,2,5
6 7,4,2,5
7 7,4,2,5
8 7,4,2,5
9 7,4,2,5
10 7,4,2,5
11 7,4,2,5
12 7,4,2,5
13 7,4,2,5
14 7,4,2,5
15 7,4,2,5
16 7,4,2,5
17 7,4,2,5
18 7,4,2,5
19 7,4,2,5
20 7,4,2,5
21 7,4,2,5
please comment or suggest fixes.
You're never rewinding stat_csv_file, so eventually, your loop over csv_reader (which is a wrapper around stat_csv_file) isn't looping at all, and you write whatever you found on the last loop. Basically, the logic is:
On first loop, look through all of csv_reader, find hit (though you keep looking even when you find it, exhausting the file), write hit
On all subsequent loops, the file is exhausted, so the inner search loop doesn't even execute, and you end up writing the same values as last time
The slow, but direct way to fix this is to add stat_csv_file.seek(0) before you search it:
53 print(f2_tail)
54 print("********************************")
stat_csv_file.seek(0) # Rewind to rescan input from beginning
55 for row in csv_reader:
56 if f1_tail in row:
57 file1_file_number = row[0]
58 file1_category_number = row[2]
59 if f2_tail in row:
60 file2_file_number = row[0]
61 file2_category_number = row[2]
A likely better approach would be to load the input CSV into a dict once, then perform lookup there as needed, avoiding repeated (slow) I/O in favor of fast dict lookup. The cost would be higher memory use; if the input CSV is small enough, that's not an issue, if it's huge, you may need to use a proper database to get the rapid lookup without blowing memory.
It's a little unclear what the logic should be here, since your inputs and outputs don't align (your output should start with a repeated digit, but it doesn't for some reason?). But if the intent is that the input contains file_number, file_tail, category_number, then you could begin your code (above the top level loop) with:
# Create mapping from second field to associated first and third fields
tail_to_numbers = {ftail: (fnum, cnum) for fnum, ftail, cnum in csv_reader}
Then replace:
for row in csv_reader:
if f1_tail in row:
file1_file_number = row[0]
file1_category_number = row[2]
if f2_tail in row:
file2_file_number = row[0]
file2_category_number = row[2]
row_complete = [file1_file_number, file2_file_number, file1_category_number, file2_category_number ]
a_complete.writerow(row_complete)
with the simpler and much faster:
try:
file1_file_number, file1_category_number = tail_to_numbers[f1_tail]
file2_file_number, file2_category_number = tail_to_numbers[f2_tail]
except KeyError:
# One of the tails wasn't found in the lookup dict, so don't output
# (variables would be stale or unset); optionally emit some error to stderr
continue
else:
# Found both tails, output associated values
row_complete = [file1_file_number, file2_file_number, file1_category_number, file2_category_number]
a_complete.writerow(row_complete)
Related
I am completely new to openpyxl and python and I am having a hard time with this issue and i need your help.
JAN FEB MAR MAR YTD 2019 YTD
25 9 57 23 7
61 41 29 5 57
54 34 58 10 7
13 13 63 26 45
31 71 40 40 40
24 38 63 63 47
31 50 43 2 61
68 33 13 9 63
28 1 30 39 71
I have an Excel report with the data above. I'd like to search the cells for those that contain a specific string (i.e., YTD) and get the column number of the YTD column. I want to use the column number to extract the data for that column. I do not want to use fixed row and cell references, as the Excel file gets updated regularly, so the column will always move.
def t_PM(ff_sheet1,start_row):
wb = openpyxl.load_workbook(filename='report') # open report
report_sheet1 = wb.get_sheet_by_name('sheet 1')
col = -1
for j, keyword in enumerate(report_sheet1.values(0)):
if keyword=='YTD':
col = j
break
ff_sheet1.cell(row=insert_col + start_row, column= header['YTD_OT'], value=report_sheet1.cell(row=i + 7, column=col).value)
But then I get a "'generator' object is not callable" error. How can I fix this?
Your problem is that report_sheet1.values is a generator, so you can't call it with (0). Judging by your code, I assume you don't want to rely on "YTD" appearing in the first row, so you iterate over all cells. Do this with:
def find_YTD():
    """Return the cell values of the first column containing a 'YTD' text cell.

    Opens the workbook 'report', walks the columns of 'sheet 1' in order, and
    returns the first column (the values yielded by
    ``iter_cols(values_only=True)``) in which any string cell contains 'YTD'.
    Returns None when no cell matches.
    """
    wb = openpyxl.load_workbook(filename='report')  # open report
    # Index the workbook by sheet name; get_sheet_by_name() is deprecated.
    report_sheet1 = wb['sheet 1']
    for col in report_sheet1.iter_cols(values_only=True):
        for value in col:
            # Cells may be empty (None) or numeric, so test the type first.
            if isinstance(value, str) and 'YTD' in value:
                return col
    return None
If you are assuming this data will be in the first row, simply do:
for cell in report_sheet1[1]:
if isinstance(value, str) and 'YTD' in cell.value:
return cell.column
openpyxl uses '1-based' line indexing
Read the docs - access many cells
I want to copy one column from a file and write it to a new file.
I have a file:
1 12 13 14
2 22 23 24
3 32 33 34
4 42 43 44
5 52 53 54
6 62 63 64
I need to copy the 4th column to a new file.
In the code, you can see that I take the file and delete the first two lines from it. After that come my attempts to create a file with one column.
f=open("1234.txt").readlines()
for i in [0,0,-1]:
f.pop(i)
with open("1234.txt",'w') as F:
F.writelines(f)
ff = open("1234.txt", 'r')
df1 = ff.iloc[:,3:3]
print(df1)
with open('12345.txt', 'w') as F:
df.writelines('12345.txt')
I'm not sure whether I need to import something for iloc; maybe it is pandas? Should I close the files in the code, and if so, when?
Hi, when I try to print a list, it prints out the directory and not the contents of win.txt. I'm trying to enumerate the lines of the txt file, split each one and append it to a, then do other things once I get a to print. What am I doing wrong?
import os
win_path = os.path.join(home_dir, 'win.txt')
def roundedStr(num):
return str(int(round(num)))
a=[] # i declares outside the loop for recover later
for i,line in enumerate(win_path):
# files are iterable
if i==0:
t=line.split(' ')
else:
t=line.split(' ')
t[1:6]= map(int,t[1:6])
a.append(t) ## a have all the data
a.pop(0)
print a
prints out directory, like example c:\workspace\win.txt
NOT what I want
I want it to print the contents of win.txt
which takes t[1:6] as integers, like
11 21 31 41 59 21
and prints that out like that same way.
win.txt contains this
05/06/2017 11 21 31 41 59 21 3
05/03/2017 17 18 49 59 66 9 2
04/29/2017 22 23 24 45 62 5 2
04/26/2017 01 15 18 26 51 26 4
04/22/2017 21 39 41 48 63 6 3
04/19/2017 01 19 37 40 52 15 3
04/15/2017 05 22 26 45 61 13 3
04/12/2017 08 14 61 63 68 24 2
04/08/2017 23 36 51 53 60 15 2
04/05/2017 08 20 46 53 54 13 2
I just want [1]-[6]
I think what you want is to open the file 'win.txt', and read its content. Using the open function to create a file object, and a with block to scope it. See my example below. This will read the file, and take the first 6 numbers of each line.
import os
win_path = os.path.join(home_dir, 'win.txt')
a=[] # i declares outside the loop for recover later
with open(win_path, 'r') as file:
for i,line in enumerate(file):
line = line.strip()
print(line)
if i==0:
t=line.split(' ')
else:
t=line.split(' ')
t[1:7]= map(int,t[1:7])
t = t[1:7]
a.append(t) ## a have all the data
a.pop(0)
print (a)
import csv
import output
fill = input("Enter File name:")
f = open(fill)
csv_f = csv.reader(f)
m = open('data.csv', "w")
dict_out = {}
for row in csv_f:
if row[1] in dict_out:
dict_out[row[1]] += row[3]
else:
dict_out[row[1]] = row[3]
for title, value in dict_out.items():
m.write('{},'.format(title))
m.write ('{} \n'.format(value))
m.close()
Prints my csv as
Title,Detail
Siding, 50 63 22 68 138 47 123 107 107 93 117
Asphalt, 49 8 72 19 125 95 33 83 123 144
Rail, 82 98 89 62 58 66 24 77 120 93
Grinding, 127 47 20 66 29 137 33 145 3 98
Concrete, 130 75 12 88 22 137 114 88 143 16
I would like to put a comma in between the numbers. I have tried m.write(',') after m.write('{} \n'.format(value)) but it only adds it after the last one. How can i format it so it will output as
Title,Detail
Siding, 50,63,22,68,138,47,123,107,107,93,117
Asphalt, 49,8,72,19,125,95,33,83,123,144
Rail, 82,98,89,62,58,66,24,77,120,93
Grinding, 127,47,20,66,29,137,33,145,3,98
Concrete, 130,75,12,88,22,137,114,88,143,16
Not the best way, but you can:
for title, value in dict_out.items():
m.write('{},'.format(title))
m.write ('{} \n'.format(value.replace(' ', ',')))
But you should definitely use a csv writer:
import csv
import output
# Group the fourth column of the input CSV by the value in its second
# column, then write one output row per group: the key followed by every
# collected value, each as its own comma-separated field.
fill = input("Enter File name:")
dict_out = {}
# newline='' is what the csv module asks for on files it reads or writes;
# without it the writer can emit blank lines between rows on Windows.
with open(fill, newline='') as f:
    for row in csv.reader(f):
        # Collect values in a list so csv.writer emits them as separate fields.
        dict_out.setdefault(row[1], []).append(row[3])
# The with-blocks close both files even if a malformed row raises.
with open('data.csv', "w", newline='') as c:
    m = csv.writer(c)
    for title, value in dict_out.items():
        m.writerow([title] + value)
If value is a string then you need to use value.split(). If it is already a list then you don't need to use the split method.
with open('data.csv', "w") as m:
for title, value in dict_out.items():
m.write(title + "," + ",".join(value.split()) + "\n")
For index.csv file, its fourth column has ten numbers ranging from 1-5. Each number can be regarded as an index, and each index corresponds with an array of numbers in filename.csv.
The row number of filename.csv represents the index, and each row has three numbers. My question is about using a nested loop to transfer the numbers in filename.csv to index.csv.
from numpy import genfromtxt
import numpy as np
import csv
data1 = genfromtxt('filename.csv', delimiter=',')
data2 = genfromtxt('index.csv', delimiter=',')
f = open('index.csv','wb')
write = csv.writer(f, delimiter=',',quoting=csv.QUOTE_ALL)
for row in data2:
for ch_row in data1:
if ( data2[row,3] == ch_row ):
write.writerow(data1[data2[row,3],:])
For example, the fourth column of index.csv contains 1,2,5,3,4,1,4,5,2,3 and filename.csv contains:
# filename.csv
20 30 50
70 60 45
35 26 77
93 37 68
13 08 55
What I need is to write the indexed row from filename.csv to index.csv and store these numbers in the 5th, 6th and 7th columns:
# index.csv
# 4 5 6 7
... 1 20 30 50
... 2 70 60 45
... 5 13 08 55
... 3 35 26 77
... 4 93 37 68
... 1 20 30 50
... 4 93 37 68
... 5 13 08 55
... 2 70 60 45
... 3 35 26 77
Can anyone help me solve this problem?
You need to indent your last 2 lines. Also, it looks like you are writing to the file from which you are reading.