rglob to batch convert CSVs to TSVs - python

I have a folder of hundreds of CSVs that I need to convert to TSV for Postgres upload.
I wrote this script, but nothing seems to happen when I run it. Can anyone see what the issue is?
import os
import sys
import csv
import pandas as pd
import numpy as np
import pathlib
for file in pathlib.Path().rglob('*.csv'):
with open(file,'r') as csvin, open(file + ".tsv", 'w') as tsvout:
csvin = csv.reader(csvin)
tsvout = csv.writer(tsvout, delimiter='\t')
for row in csvin:
tsvout.writerow(row)

You are already importing pandas, so you could try:
import os
import sys
import csv
import pandas as pd
import numpy as np
import pathlib
# Walk the current directory tree and write a tab-separated copy next to each CSV.
for file in pathlib.Path().rglob('*.csv'):
    # pandas has no top-level from_csv; read_csv is the reader.
    # dtype=str and keep_default_na=False stop pandas from re-typing values
    # (e.g. dropping leading zeros or turning "NA" into NaN) during conversion.
    df = pd.read_csv(file, dtype=str, keep_default_na=False)
    # with_suffix swaps ".csv" for ".tsv" so the source file is not overwritten;
    # index=False keeps the row index from becoming an extra first column.
    df.to_csv(file.with_suffix('.tsv'), sep='\t', index=False)

Related

Pandas code running but not doing anything

Below is the code:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
dataset=pd.read_excel('file_path')
sia=SentimentIntensityAnalyzer()
dataset['polarity scores']=dataset['column_title'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
print("done")
I would like it to read the Excel file located at file_path and give me polarity scores for the text in the column titled column_title, but I'm not sure what I'm doing wrong. The code runs without any errors, but it does not edit the Excel file at all.
You forgot to save your file.
Use dataset.to_excel('output_file_path.xlsx') to save it:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Load the spreadsheet, score each row's text with VADER, and save the result.
dataset = pd.read_excel('file_path')
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the VADER compound polarity score for *text* (coerced to str)."""
    scores = sia.polarity_scores(str(text))
    return scores['compound']


dataset['polarity scores'] = dataset['column_title'].apply(_compound_score)
# Without this write the new column only ever exists in memory.
dataset.to_excel('output_file_path.xlsx', index=False)
print("done")

Writing pdf data into csv

I am a beginner to python and I am stuck in a critical part of my script. The goal is to get fillable pdf data into one single csv file. I have built a script to do so through pdfminer but I am now stuck in the part where I should get my script to write all of the output data into my csv. Instead, I am only getting one data line written in my csv. Any clue please?
Here is my code:
import sys, os
import six
import csv
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
fp = open("C:\\Users\\Sufyan\\Desktop\\test\\ccc.pdf", "rb")
parser = PDFParser(fp)
doc = PDFDocument(parser)
fields = resolve1(doc.catalog["AcroForm"])["Fields"]
for i in fields:
field = resolve1(i)
name, value = field.get("T"), field.get("V")
filehehe = "{0}:{1}".format(name,value)
values = resolve1(value)
names = resolve1(name)
print(values)
with open('test.csv','wb') as f:
for i in names:
f.write(values)
Clues:
The loop variable i is never used inside this loop:
with open('test.csv','wb') as f:
for i in names:
f.write(values)
For the same for loop, which values does names contain? Are these a lot? 1? 2? Why?

How to handle "OSError: Cannot seek back after getting firstbytes" when trying to extract jpg files into numpy array

I'm trying to collect all images in a zip file into a numpy array.
Always getting an error:
OSError: Cannot seek back after getting firstbytes!
import urllib.request
import os
import zipfile
import scipy
import numpy as np
import pandas as pd
import glob
import imageio
from os.path import splitext
url = 'https://github.com/yoavram/Sign-Language/raw/master/Dataset.zip'
filename = '../data/sign-lang'
if not os.path.exists('../data'):
os.mkdir('../data')
if not os.path.exists(filename):
urllib.request.urlretrieve(url, filename)
zf = zipfile.ZipFile(filename)
for file in zf.namelist():
basename,extension = splitext(file)
if extension == '.jpg':
with zf.open(file) as img_file:
img = imageio.read(img_file)
Help?
I had a similar problem for the longest time.
The problem is that imageio.read() expects bytes but is being given a file-like object.
To fix this, simply read the bytes from the file.
img = imageio.read(img_file.read())
Also, if you want numpy arrays you should use imageio.imread()

Save selection of multiple Excel workbooks to one pdf with Python

I want to make a PDF composed of ranges from all Excel workbooks located in a given folder (folderwithallfiles). All workbooks will have the same structure, so the range reference will be the same for all workbooks.
I thought I got it with the script below, but it does not work.
import win32com.client as win32
import glob
import os
xlfiles = sorted(glob.glob("*.xlsx"))
#print "Reading %d files..."%len(xlfiles)
cwd = "C:\\Users\\user\folderwithallfiles"
#cwd = os.getcwd()
path_to_pdf = r'C:\\Users\\user\folderwithallfiles\multitest.pdf'
excel = win32.gencache.EnsureDispatch('Excel.Application')
for xlfile in xlfiles:
wb = excel.Workbooks.Open(cwd+"\\"+xlfile)
ws = wb.Sheets('sheet 1')
ws.Range("A1:Q59").Select()
wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)
Please check whether the code below works. I wrote it on the fly, so let me know if you find any issues with it.
import pandas as pd
import numpy as np
import glob
import pdfkit as pdf
# Combine every matching workbook into one DataFrame, then render it to PDF via HTML.
# Raw strings keep "\f" and "\a" in the Windows paths from being interpreted as
# form-feed and bell escape sequences.
frames = [pd.read_excel(f) for f in glob.glob(r"filepath\file*.xlsx")]
# pd.concat raises on an empty list, so fall back to an empty frame when no
# workbook matched; a single concat also avoids re-copying the data per file.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
all_data.to_html(r"filepath\all_data.html")
pdf.from_file(r"filepath\all_data.html", r"filepath\all_data.pdf")

import multiple csv from folder into a dataframe in python

I was trying to import 15 different csv into a data frame.
Installed pyserial and it still shows "cannot import name serial"
Also tried to
"try:
import serial # Python2
except ImportError:
from serial3 import * # Python3"
and still not able to run it.
Here is my code:
from serial import serial
frame = pd.DataFrame()
list_ = []
allfiles = glob.glob(path + "/*.csv")
for file_ in allfiles:
df1 = pd.read.csv(file_, index_col=None)
list_.append(df1)
frame = pd.concat(list_)
print frame
ImportError: cannot import name serial
Thanks everyone in advance !
replace this
from serial import serial
to:
from serial import Serial
^
Capital S
you also need to do this
pd.read.csv(file_, index_col=None)
^
pd.read_csv(file_, index_col=None)
If you haven't imported pandas:
import pandas as pd

Categories

Resources