Large text file to csv, can't open text file - python

I'm trying to convert this 3,1 GB text file from https://snap.stanford.edu/data/
into a csv file. All the data is structured like:
name: something
age: something
gender: something
which makes it a pretty large text file with some million lines.
I have tried to write a py script to convert it but for some reason it won't read the lines in my for each loop.
Here is the code:
import csv
def trycast(x):
try:
return float(x)
except:
try:
return int(x)
except:
return x
cols = ['product_productId', 'review_userId', 'review_profileName', 'review_helpfulness', 'review_score', 'review_time', 'review_summary', 'review_text']
f = open("movies.txt", "wb")
w = csv.writer(f)
w.writerow(cols)
doc = {}
with open('movies.txt') as infile:
for line in infile:
line = line.strip()
if line=="":
w.writerow([doc.get(col) for col in cols])
doc = {}
else:
idx = line.find(":")
key, value = tuple([line[:idx], line[idx+1:]])
key = key.strip().replace("/", "_").lower()
value = value.strip()
doc[key] = trycast(value)
f.close()
I'm not sure if it is because the document is to large, because a regulare notepad program won't be able to open it.
Thanks up front! :-)

In the line f = open("movies.txt", "wb") you're opening the file for writing, and thereby deleting all its content. Later on, you're trying to read from that same file. It probably works fine if you change the output filename. (I am not going to download 3.1 GB to test it. ;) )

Related

How to edit specific line for all text files in a folder by python?

Here below is my code about how to edit text file.
Since python can't just edit a line and save it at the same time,
I save the previous text file's content into a list first then write it out.
For example,if there are two text files called sample1.txt and sample2.txt in the same folder.
Sample1.txt
A for apple.
Second line.
Third line.
Sample2.txt
First line.
An apple a day.
Third line.
Execute python
import glob
import os
#search all text files which are in the same folder with python script
path = os.path.dirname(os.path.abspath(__file__))
txtlist = glob.glob(path + '\*.txt')
for file in txtlist:
fp1 = open(file, 'r+')
strings = [] #create a list to store the content
for line in fp1:
if 'apple' in line:
strings.append('banana\n') #change the content and store into list
else:
strings.append(line) #store the contents did not be changed
fp2 = open (file, 'w+') # rewrite the original text files
for line in strings:
fp2.write(line)
fp1.close()
fp2.close()
Sample1.txt
banana
Second line.
Third line.
Sample2.txt
First line.
banana
Third line.
That's how I edit specific line for text file.
My question is : Is there any method can do the same thing?
Like using the other functions or using the other data type rather than list.
Thank you everyone.
Simplify it to this:
with open(fname) as f:
content = f.readlines()
content = ['banana' if line.find('apple') != -1 else line for line in content]
and then write value of content to file back.
Instead of putting all the lines in a list and writing it, you can read it into memory, replace, and write it using same file.
def replace_word(filename):
with open(filename, 'r') as file:
data = file.read()
data = data.replace('word1', 'word2')
with open(filename, 'w') as file:
file.write(data)
Then you can loop through all of your files and apply this function
The built-in fileinput module makes this quite simple:
import fileinput
import glob
with fileinput.input(files=glob.glob('*.txt'), inplace=True) as files:
for line in files:
if 'apple' in line:
print('banana')
else:
print(line, end='')
fileinput redirects print into the active file.
import glob
import os
def replace_line(file_path, replace_table: dict) -> None:
list_lines = []
need_rewrite = False
with open(file_path, 'r') as f:
for line in f:
flag_rewrite = False
for key, new_val in replace_table.items():
if key in line:
list_lines.append(new_val+'\n')
flag_rewrite = True
need_rewrite = True
break # only replace first find the words.
if not flag_rewrite:
list_lines.append(line)
if not need_rewrite:
return
with open(file_path, 'w') as f:
[f.write(line) for line in list_lines]
if __name__ == '__main__':
work_dir = os.path.dirname(os.path.abspath(__file__))
txt_list = glob.glob(work_dir + '/*.txt')
replace_dict = dict(apple='banana', orange='grape')
for txt_path in txt_list:
replace_line(txt_path, replace_dict)

how to join incorporate splitted lines with replacing data from a file into the same string

So as most of us are thinking it's a duplicate which is not, so what I'm trying to achieve is let's say there is a Master string like the below and couple of files mentioned in it then we need to open the files and check if there are any other files included in it, if so we need to copy that into the line where we fetched that particular text.
Master String:
Welcome
How are you
file.txt
everything alright
signature.txt
Thanks
file.txt
ABCDEFGHtele.txt
tele.txt
IJKL
signature.txt
SAK
Output:
Welcome
How are you
ABCD
EFGH
IJKL
everything alright
SAK
Thanks
for msplitin [stext.split('\n')]:
for num, items in enumerate(stext,1):
if items.strip().startswith("here is") and items.strip().endswith(".txt"):
gmsf = open(os.path.join(os.getcwd()+"\txt", items[8:]), "r")
gmsfstr = gmsf.read()
newline = items.replace(items, gmsfstr)
How to join these replace items in the same string format.
Also, any idea on how to re-iterate the same function until there are no ".txt". So, once the join is done there might be other ".txt" inside a ".txt.
Thanks for your help in advance.
A recursive approach that works with any level of file name nesting:
from os import linesep
def get_text_from_file(file_path):
with open(file_path) as f:
text = f.read()
return SAK_replace(text)
def SAK_replace(s):
lines = s.splitlines()
for index, l in enumerate(lines):
if l.endswith('.txt'):
lines[index] = get_text_from_file(l)
return linesep.join(lines)
You can try:
s = """Welcome
How are you
here is file.txt
everything alright
here is signature.txt
Thanks"""
data = s.split("\n")
match = ['.txt']
all_matches = [s for s in data if any(xs in s for xs in match)]
for index, item in enumerate(data):
if item in all_matches:
data[index] ="XYZ"
data = "\n".join(data)
print data
Output:
Welcome
How are you
XYZ
everything alright
XYZ
Thanks
Added new requirement:
def file_obj(filename):
fo = open(filename,"r")
s = fo.readlines()
data = s.split("\n")
match = ['.txt']
all_matches = [s for s in data if any(xs in s for xs in match)]
for index, item in enumerate(data):
if item in all_matches:
file_obj(item)
data[index] ="XYZ"
data = "\n".join(data)
print data
file_obj("first_filename")
We can create temporary file object and keep the replaced line in that temporary file object and once everything line is processed then we can replace with the new content to original file. This temporary file will be deleted automatically once its come out from the 'with' statement.
import tempfile
import re
file_pattern = re.compile(ur'(((\w+)\.txt))')
original_content_file_name = 'sample.txt'
"""
sample.txt should have this content.
Welcome
How are you
here is file.txt
everything alright
here is signature.txt
Thanks
"""
replaced_file_str = None
def replace_file_content():
"""
replace the file content using temporary file object.
"""
def read_content(file_name):
# matched file name is read and returned back for replacing.
content = ""
with open(file_name) as fileObj:
content = fileObj.read()
return content
# read the file and keep the replaced text in temporary file object(tempfile object will be deleted automatically).
with open(original_content_file_name, 'r') as file_obj, tempfile.NamedTemporaryFile() as tmp_file:
for line in file_obj.readlines():
if line.strip().startswith("here is") and line.strip().endswith(".txt"):
file_path = re.search(file_pattern, line).group()
line = read_content(file_path) + '\n'
tmp_file.write(line)
tmp_file.seek(0)
# assign the replaced value to this variable
replaced_file_str = tmp_file.read()
# replace with new content to the original file
with open(original_content_file_name, 'w+') as file_obj:
file_obj.write(replaced_file_str)
replace_file_content()

How to open a csv file for reading purpose using mmap in python?

I want to open csv file for reading purpose. But I'm facing some exceptions regarding to that.
I'm using Python 2.7.
main.python-
if __name__ == "__main__":
f = open('input.csv','r+b')
m = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
reader = csv.DictReader(iter(m.readline, ""))
for read in reader:
num = read['time']
print num
output-
Traceback (most recent call last):
File "/home/PycharmProjects/time_gap_Task/main.py", line 22, in <module>
for read in reader:
File "/usr/lib/python3.4/csv.py", line 109, in __next__
self.fieldnames
File "/usr/lib/python3.4/csv.py", line 96, in fieldnames
self._fieldnames = next(self.reader)
_csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
How to resolve this error? and how to open csv file using mmap and csv in good manner so code is working perfect?
I know you asked this a while ago, but I actually created a module for myself that does this, because I do a lot of work with large CSV files, and sometimes I need to convert them into dictionaries, based on a key. Below is the code I've been using. Please feel free to modify as needed.
def MmapCsvFileIntoDict(csvFilePath, skipHeader = True, transform = lambda row: row, keySelector = lambda o: o):
"""
Takes a CSV file path and uses mmap to open the file and return a dictionary of the contents keyed
on the results of the keySelector. The default key is the transformed object itself. Mmap is used because it is
a more efficient way to process large files.
The transform method is used to convert the line (converted into a list) into something else. Hence 'transform'.
If you don't pass it in, the transform returns the list itself.
"""
contents = {}
firstline = False
try:
with open(csvFilePath, "r+b") as f:
# memory-map the file, size 0 means whole file
mm = mmap.mmap(f.fileno(), 0)
for line in iter(mm.readline, b''):
if firstline == False:
firstline = True
if skipHeader == True:
continue
row = ''
line = line.decode('utf-8')
line = line.strip()
row = next(csv.reader([line]), '')
if transform != None and callable(transform):
if row == None or row == '':
continue
value = transform(row)
else:
value = row
if callable(keySelector):
key = keySelector(value)
else:
key = keySelector
contents[key] = value
except IOError as ie:
PrintWithTs('Error decomposing the companies: {0}'.format(ie))
return {}
except:
raise
return contents
When you call this method, you have some options.
Assume you have a file that looks like:
Id, Name, PhoneNumber
1, Joe, 7175551212
2, Mary, 4125551212
3, Vince, 2155551212
4, Jane, 8145551212
The easiest way to call it is like this:
dict = MmapCsvFileIntoDict('/path/to/file.csv', keySelector = lambda row: row[0])
What you get back is a dict looking like this:
{ '1' : ['1', 'Joe', '7175551212'], '2' : ['2', 'Mary', '4125551212'] ...
One thing I like to do is create a class or a namedtuple to represent my data:
class CsvData:
def __init__(self, row):
self.Id = int(row[0])
self.Name = row[1].upper()
self.Phone = int(row[2])
And then when I call the method, I pass in a second lambda to transform each row in the file to an object I can work with:
dict = MmapCsvFileIntoDict('/path/to/file.csv', transform = lambda row: CsvData(row), keySelector = lambda o: o.Id)
What I get back that time looks like:
{ 1 : <object instance>, 2 : <object instance>...
I hope this helps! Best of luck
When open a file with the flag b like this:
f = open('input.csv','r+b')
You read the file as bytes and not as string.
So, try to change the flags to r:
f = open('input.csv','r')
if you just want to read data with specific columnes from csv file, just try:
import csv
with open('input.csv') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
print row['time']

Append JSON to file

I am trying to append values to a json file. How can i append the data? I have been trying so many ways but none are working ?
Code:
def all(title,author,body,type):
title = "hello"
author = "njas"
body = "vgbhn"
data = {
"id" : id,
"author": author,
"body" : body,
"title" : title,
"type" : type
}
data_json = json.dumps(data)
#data = ast.literal_eval(data)
#print data_json
if(os.path.isfile("offline_post.json")):
with open('offline_post.json','a') as f:
new = json.loads(f)
new.update(a_dict)
json.dump(new,f)
else:
open('offline_post.json', 'a')
with open('offline_post.json','a') as f:
new = json.loads(f)
new.update(a_dict)
json.dump(new,f)
How can I append data to json file when this function is called?
I suspect you left out that you're getting a TypeError in the blocks where you're trying to write the file. Here's where you're trying to write:
with open('offline_post.json','a') as f:
new = json.loads(f)
new.update(a_dict)
json.dump(new,f)
There's a couple of problems here. First, you're passing a file object to the json.loads command, which expects a string. You probably meant to use json.load.
Second, you're opening the file in append mode, which places the pointer at the end of the file. When you run the json.load, you're not going to get anything because it's reading at the end of the file. You would need to seek to 0 before loading (edit: this would fail anyway, as append mode is not readable).
Third, when you json.dump the new data to the file, it's going to append it to the file in addition to the old data. From the structure, it appears you want to replace the contents of the file (as the new data contains the old data already).
You probably want to use r+ mode, seeking back to the start of the file between the read and write, and truncateing at the end just in case the size of the data structure ever shrinks.
with open('offline_post.json', 'r+') as f:
new = json.load(f)
new.update(a_dict)
f.seek(0)
json.dump(new, f)
f.truncate()
Alternatively, you can open the file twice:
with open('offline_post.json', 'r') as f:
new = json.load(f)
new.update(a_dict)
with open('offline_post.json', 'w') as f:
json.dump(new, f)
This is a different approach, I just wanted to append without reloading all the data. Running on a raspberry pi so want to look after memory. The test code -
import os
json_file_exists = 0
filename = "/home/pi/scratch_pad/test.json"
# remove the last run json data
try:
os.remove(filename)
except OSError:
pass
count = 0
boiler = 90
tower = 78
while count<10:
if json_file_exists==0:
# create the json file
with open(filename, mode = 'w') as fw:
json_string = "[\n\t{'boiler':"+str(boiler)+",'tower':"+str(tower)+"}\n]"
fw.write(json_string)
json_file_exists=1
else:
# append to the json file
char = ""
boiler = boiler + .01
tower = tower + .02
while(char<>"}"):
with open(filename, mode = 'rb+') as f:
f.seek(-1,2)
size=f.tell()
char = f.read()
if char == "}":
break
f.truncate(size-1)
with open(filename, mode = 'a') as fw:
json_string = "\n\t,{'boiler':"+str(boiler)+",'tower':"+str(tower)+"}\n]"
fw.seek(-1, os.SEEK_END)
fw.write(json_string)
count = count + 1

python text reading

datafile = open("temp.txt", "r")
record = datafile.readline()
while record != '':
d1 = datafile.strip("\n").split(",")
print d1[0],float (d1[1])
record = datafile.readline()
datafile.close()
The temp file contains
a,12.7
b,13.7
c,18.12
I can't get output. Please help.
The correct code should be:
with open('temp.txt') as f:
for line in f:
after_split = line.strip("\n").split(",")
print after_split[0], float(after_split[1])
The main reason you're not getting output in your code is that datafile doesn't have a strip() method, and I'm surprised you're not getting exceptions.
I highly suggest you read the Python tutorial - it looks like you're trying to write Python in another language and that is not A Good Thing
You want to call strip and split on the line, not the file.
Replace
d1 = datafile.strip("\n").split(",")
With
d1 = record.strip("\n").split(",")
you operating with file handler, but should work on line
like this d1 = record.strip("\n").split(",")
datafile = open("temp.txt", "r")
record = datafile.readline()
while record != '':
d1 = record.strip("\n").split(",")
print d1[0],float (d1[1])
record = datafile.readline()
datafile.close()
Perhaps the following will work better for you (comments as explanation):
# open file this way so that it automatically closes upon any errors
with open("temp.txt", "r") as f:
data = f.readlines()
for line in data:
# only process non-empty lines
if line.strip():
d1 = line.strip("\n").split(",")
print d1[0], float(d1[1])

Categories

Resources