How to read fields from JSON-LD to CSV? - python

I am trying to extract values from JSON-LD into CSV, as they are in the file. There are a couple of issues I am facing.
1. The values being read for different fields are getting truncated in most cases. In the remaining cases, the value of one field appears under some other field.
2. I am also getting an error - 'Additional data' - after some 4,000 lines.
The file is quite big (half a GB). I am attaching a shortened version of my code; please tell me where I am going wrong.
The input file - I have shortened it and kept it here, since there was no way of putting it inline:
https://github.com/Architsi/json-ld-issue
I tried writing this script, and I tried multiple online converters too.
import csv, sys, math, operator, re, os, json, ijson
from pprint import pprint

filelist = []
for file in os.listdir("."):
    if file.endswith(".json"):
        filelist.append(file)

for input in filelist:
    newCsv = []
    splitlist = input.split(".")
    output = splitlist[0] + '.csv'
    newFile = open(output, 'w', newline='')  # wb for windows, else you'll see newlines added to csv
    # initialize csv writer
    writer = csv.writer(newFile)
    # Name of the columns
    header_row = ('Format', 'Description', 'Object', 'DataProvider')
    writer.writerow(header_row)
    with open(input, encoding="utf8") as json_file:
        data = ijson.items(json_file, 'item')
        # passing all the values through try/except
        for s in data:
            source = s['_source']
            try:
                source_resource = source['sourceResource']
            except:
                print("Warning: No source resource in record ID: " + id)
            try:
                data_provider = source['dataProvider'].encode()
            except:
                data_provider = "N/A"
            try:
                _object = source['object'].encode()
            except:
                _object = "N/A"
            try:
                descriptions = source_resource['description']
                string = ""
                for item in descriptions:
                    if len(descriptions) > 1:
                        description = item.encode()  # + " | "
                    else:
                        description = item.encode()
                    string = string + description
                description = string.encode()
            except:
                description = "N/A"
            created = ""
            # writing it to csv
            write_tuple = ('format', description, _object, data_provider)
            writer.writerow(write_tuple)
    print("File written to " + output)
    newFile.close()
The error that I am getting is this: raise common.JSONError('Additional data')
The expected result is a CSV file with all the columns and correct values.
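For what it's worth, ijson raises 'Additional data' when the file contains more than one top-level JSON value (e.g., concatenated JSON documents rather than a single array), and recent versions of ijson accept a multiple_values=True flag for exactly that case. Writing plain strings instead of .encode()'d bytes also avoids b'...' artifacts in the CSV. A minimal sketch under those assumptions (field names taken from the question; not tested against the actual file):

import csv
import ijson

with open('input.json', encoding='utf8') as json_file, \
        open('output.csv', 'w', newline='', encoding='utf8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(('Format', 'Description', 'Object', 'DataProvider'))
    # multiple_values=True tolerates several top-level JSON values in one file,
    # which is what usually triggers ijson's 'Additional data' error
    for s in ijson.items(json_file, 'item', multiple_values=True):
        source = s.get('_source', {})
        source_resource = source.get('sourceResource', {})
        descriptions = source_resource.get('description', [])
        if isinstance(descriptions, str):
            descriptions = [descriptions]
        # keep everything as str; .encode() would write bytes reprs into the CSV
        description = " | ".join(descriptions) if descriptions else "N/A"
        writer.writerow((
            source_resource.get('format', 'N/A'),
            description,
            source.get('object', 'N/A'),
            source.get('dataProvider', 'N/A'),
        ))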


Can't figure out how to properly output my data

I'm a relative novice at Python, but somehow I've managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my csv:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'm looking to end up with is five columns at the end of my .CSV, each outputting the X-th most commonly used value.
I've Googled for a while and managed to isolate them separately, but I always end up with '('#thekidfromakron', 2)' as an output. I seem to be missing some part of the puzzle :(.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter

ts = time.gmtime()

def get_csv_header(top_numb):
    fieldnames = ['USER','MEDIA COUNT','FOLLOWERCOUNT','TOTAL LIKES','TOTAL COMMENTS','ER','ER IN %', 'BIO', 'ALL CAPTION TEXT','HASHTAGS COUNTED','MOST COMMON HASHTAGS']
    return fieldnames

def write_csv_header(filename, headers):
    with open(filename, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=headers)
        writer.writeheader()
    return

def read_user_name(t_file):
    with open(t_file) as f:
        user_list = f.read().splitlines()
    return user_list

if __name__ == '__main__':
    # HERE YOU CAN SPECIFY YOUR USERLIST FILE NAME,
    # which contains a list of usernames. BY DEFAULT <current working directory>/userlist.txt
    USER_FILE = 'userlist.txt'
    # HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv), where your final result stays
    DATA_FILE = 'users_with_er.csv'
    MAX_POST = 12  # MAX POST

    print('Starting the engagement calculations... Please wait until it finishes!')

    users = read_user_name(USER_FILE)
    """ Writing data to csv file """
    csv_headers = get_csv_header(MAX_POST)
    write_csv_header(DATA_FILE, csv_headers)

    for user in users:
        post_info = {'USER': user}
        url = 'https://www.instagram.com/' + user + '/'
        # for troubleshooting, un-comment the next two lines:
        # print(user)
        # print(url)
        try:
            r = requests.get(url)
            if r.status_code != 200:
                print(timestamp, ' user {0} not found or page unavailable! Skipping...'.format(user))
                continue
            soup = BeautifulSoup(r.content, "html.parser")
            scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
            stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
            j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
            timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
        except ValueError:
            print(timestamp, 'ValueError for username {0}...Skipping...'.format(user))
            continue
        except IndexError as error:
            # Output expected IndexErrors.
            print(timestamp, error)
            continue

        if j['graphql']['user']['edge_followed_by']['count'] <= 0:
            print(timestamp, 'user {0} has no followers! Skipping...'.format(user))
            continue
        if j['graphql']['user']['edge_owner_to_timeline_media']['count'] < 12:
            print(timestamp, 'user {0} has less than 12 posts! Skipping...'.format(user))
            continue
        if j['graphql']['user']['is_private'] is True:
            print(timestamp, 'user {0} has a private profile! Skipping...'.format(user))
            continue

        media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
        accountname = j['graphql']['user']['username']
        followercount = j['graphql']['user']['edge_followed_by']['count']
        bio = j['graphql']['user']['biography']

        i = 0
        total_likes = 0
        total_comments = 0
        all_captiontext = ''
        while i <= 11:
            total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
            total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
            captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
            caption_detail = captions['edges'][0]['node']['text']
            all_captiontext += caption_detail
            i += 1

        engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount) / 12) * 100) + '%'
        engagement_rate = (((total_likes + total_comments) / followercount) / 12 * 100)

        # isolate and count hashtags
        hashtags = re.findall(r'#\w*', all_captiontext)
        hashtags_counted = Counter(hashtags)
        most_common = hashtags_counted.most_common(5)

        with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:
            print(timestamp, 'Writing Data for user {0}...'.format(user))
            post_info["USER"] = accountname
            post_info["FOLLOWERCOUNT"] = followercount
            post_info["MEDIA COUNT"] = media_count
            post_info["TOTAL LIKES"] = total_likes
            post_info["TOTAL COMMENTS"] = total_comments
            post_info["ER"] = engagement_rate
            post_info["ER IN %"] = engagement_rate_percentage
            post_info["BIO"] = bio
            post_info["ALL CAPTION TEXT"] = all_captiontext
            post_info["HASHTAGS COUNTED"] = hashtags_counted
            csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
            csv_writer.writerow(post_info)

    """ Done with the script """
    print('ALL DONE !!!! ')
The code before this simply scrapes the webpage and compiles all the captions from the last 12 posts into "all_captiontext".
Any help solving this (probably simple) issue would be greatly appreciated, as I've been struggling with it for days (again, I'm a noob :') ).
Replace line
post_info["MOST COMMON HASHTAGS"] = most_common
with:
for i, counter_tuple in enumerate(most_common):
    tag_name = counter_tuple[0].replace('#', '')
    label = "Top %d" % (i + 1)
    post_info[label] = tag_name
There's also a bit of code missing. For example, your code doesn't include the csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
It also seems that you're opening the file just to write a single row. I don't think that's intended, so what you would like to do is collect the results into a list of dictionaries. A cleaner solution would be to use a pandas DataFrame, which you can output straight into a csv file.
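A minimal sketch of that pandas approach (scraped_profiles is a hypothetical stand-in for the scraping loop above; the keys of each post_info dict become the columns):

import pandas as pd

rows = []
for post_info in scraped_profiles():  # hypothetical generator yielding one post_info dict per user
    rows.append(post_info)

pd.DataFrame(rows).to_csv('users_with_er.csv', index=False)  # columns come from the dict keys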
most_common being the output of the call to hashtags_counted.most_common, I had a look at the docs here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
The output is formatted as follows: [(key, value), (key, value), ...], ordered by decreasing number of occurrences.
Hence, to get only the name and not the number of occurrences, you should replace:
post_info["MOST COMMON HASHTAGS"] = most_common
with
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuples. This statement builds, on the fly, the list of the first element of each tuple, keeping the sort order.
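For instance, with the counts from the question (a quick illustration; elements with equal counts keep the order they were first seen in):

from collections import Counter

hashtags_counted = Counter({'#striveforgreatness': 3, '#jamesgang': 3,
                            '#thekidfromakron': 2,
                            '#togetherwecanchangetheworld': 1,
                            '#halloweenchronicles': 1})
most_common = hashtags_counted.most_common(5)
print([x[0] for x in most_common])
# ['#striveforgreatness', '#jamesgang', '#thekidfromakron',
#  '#togetherwecanchangetheworld', '#halloweenchronicles']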

Txt file to excel conversion in python

I'm trying to convert a text file to an Excel sheet in Python. The txt file contains data in the format specified below.
Column names: reg no, zip code, loc id, emp id, lastname, first name. Each record has one or more error numbers, and each record has its column names listed above the values. I would like to create an Excel sheet containing reg no, firstname, lastname, and the errors listed in separate rows for each record.
How can I put the records in an Excel sheet? Should I be using regular expressions? And how can I insert the error numbers in different rows for the corresponding record?
Here is the link to the input file:
https://github.com/trEaSRE124/Text_Excel_python/blob/master/new.txt
Any code snippets or suggestions are kindly appreciated.
Here is a draft. Let me know if any changes are needed:
# import pandas as pd
from collections import OrderedDict
from datetime import date
import csv

with open('in.txt') as f:
    with open('out.csv', 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        # Remove initial clutter
        while("INPUT DATA" not in f.readline()):
            continue
        header = ["REG NO", "ZIP CODE", "LOC ID", "EMP ID", "LASTNAME", "FIRSTNAME", "ERROR"]; data = list(); errors = list()
        spamwriter.writerow(header)
        print header
        while(True):
            line = f.readline()
            errors = list()
            if("END" in line):
                exit()
            try:
                int(line.split()[0])
                data = line.strip().split()
                f.readline()  # get rid of \n
                line = f.readline()
                while("ERROR" in line):
                    errors.append(line.strip())
                    line = f.readline()
                spamwriter.writerow(data + errors)
                csvfile.flush()  # csv.writer objects have no flush(); flush the underlying file instead
            except:
                continue
        # while(True):
        #     line = f.readline()
Use Python 2 to run it. The errors are appended as subsequent columns; putting them in separate rows the way you want is slightly more complicated. I can fix it if still needed.
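One way to lay the errors out on separate rows instead (a sketch in the same Python 2 style, replacing the single writerow call above; untested):

# instead of: spamwriter.writerow(data + errors)
if errors:
    spamwriter.writerow(data + [errors[0]])            # record row carries the first error
    for err in errors[1:]:
        spamwriter.writerow([''] * len(data) + [err])  # remaining errors on their own rows
else:
    spamwriter.writerow(data)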
You can do this using the openpyxl library which is capable of depositing items directly into a spreadsheet. This code shows how to do that for your particular situation.
NEW_PERSON, ERROR_LINE = 1, 2

def Line_items():
    with open('katherine.txt') as katherine:
        for line in katherine:
            line = line.strip()
            if not line:
                continue
            items = line.split()
            if items[0].isnumeric():
                yield NEW_PERSON, items
            elif items[:2] == ['ERROR', 'NUM']:
                yield ERROR_LINE, line
            else:
                continue

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws['A2'] = 'REG NO'
ws['B2'] = 'LASTNAME'
ws['C2'] = 'FIRSTNAME'
ws['D2'] = 'ERROR'

row = 2
for kind, data in Line_items():
    if kind == NEW_PERSON:
        row += 2
        ws['A{:d}'.format(row)] = int(data[0])
        ws['B{:d}'.format(row)] = data[-2]
        ws['C{:d}'.format(row)] = data[-1]
        first = True
    else:
        if first:
            first = False
        else:
            row += 1
        ws['D{:d}'.format(row)] = data

wb.save(filename='katherine.xlsx')

txt to csv using Python

I have a SQL dump in txt format; it looks like this:
"Date:","8/21/2015","","Time:","16:18:38","","Name:","NC.S.RHU10.BRD"
"System Name:","NC.S.RHU10.BRD"
"Operator:","SYSTEM"
"Action:","Trend data loss"
"Comment:"," trend definition data loss occurred at 10:21:05 AM on 8/21/2015"
"Revision:","6"
"Location:",""
"Seq Number:","1278738"
" ********************************************************************************"
"Date:","8/21/2015","","Time:","16:17:17","","Name:","SC.L.SIDESHOWBOB.MBC009"
"System Name:","SC.L.SIDESHOWBOB.MBC009"
"Operator:","SYSTEM"
"Action:","FLN device return from failure"
"Comment:","Z8 RETURN from failure in Cabinet 9, Lan 3, Drop 1."
"Revision:","81"
"Location:","SC.L.SIDESHOWBOB.MBC009"
"Seq Number:","1278737"
" ********************************************************************************"
"Date:","8/21/2015","","Time:","16:17:17","","Name:","NC.S.EHU07.EAT"
"System Name:","NC.S.EHU07.EAT"
"Operator:","ITWVSIEMP01\InsightSCH"
"Action:","Trend data collection The target object could not be found on the Field"
"Panel."
"Comment:","Trend COV (0.000) Failed - The target object could not be found on the"
"Field Panel"
"Revision:","1318"
"Location:","ITWVSIEMP01"
"Seq Number:","1278735"
" ********************************************************************************"
"Date:","8/21/2015","","Time:","16:17:15","","Name:","NC.S.EHU03.TCFM"
"System Name:","NC.S.EHU03.TCFM"
"Operator:","ITWVSIEMP01\InsightSCH"
"Action:","Trend data collection"
"Comment:","COV Data Loss Detected"
"Revision:","1481"
"Location:","ITWVSIEMP01"
"Seq Number:","1278734"
" ********************************************************************************
I want to convert it into columns using Python, with the following fields:
"Date","Time","Name","System Name","Operator","Action","Comment","Type","Revision","Location","Seq Number"
Is there a ready-made function in Python that does this? Here is what I tried:
import csv

c = csv.writer(open('out.csv', 'w'), delimiter=',')
file = open('myfile.txt')
for col in file:
    data = col.split('\t')
    # find index "Date=0","Time=1","Name=2","System Name=3","Operator=4","Action=5","Comment=6","Type=7","Revision=8","Location=9","Seq Number=10"
    c.writerow([data[0], data[1], data[2], data[3], data[4], data[5],
                data[6], data[7], data[8], data[9], data[10]])  # writerow takes a single sequence
file.close()
I've just written a little utility here; maybe it could help you.
I think the last line of your input file is missing a ". Please add it at the end for a uniform delimiter.
import operator
import csv

with open('path/to/input') as infile, open('path/to/output', 'w') as outfile:
    data = {}
    writer = csv.writer(outfile, delimiter=',')
    writer.writerow(["Date", "Time", "Name", "System Name", "Operator", "Action", "Comment", "Revision", "Location", "Seq Number"])
    fields = operator.itemgetter("Date", "Time", "Name", "System Name", "Operator", "Action", "Comment", "Revision", "Location", "Seq Number")
    for line in infile:
        if line.startswith('" *'):
            try:
                writer.writerow(fields(data))
            except KeyError:  # itemgetter raises KeyError for a missing field
                print('malformed input')
                raise
            data = {}
            continue
        parts = line.split(',')
        if line.startswith('"Date'):
            data['Date'] = parts[1]
            data['Time'] = parts[4]
            data['Name'] = parts[-1]
            continue
        name = parts[0].strip('"').rstrip(":")
        value = parts[1].strip('"')
        data[name] = value
The following script should work; it generates your header fields automatically and preserves their order in the CSV file, so it should still work if the format changes a bit:
import csv

with open("sqldump.txt", "r") as f_input, open("output.csv", "wb") as f_output:
    csv_input = csv.reader(f_input)
    csv_output = csv.writer(f_output)
    headers = []
    for cols in csv_input:
        if len(cols) > 1:
            headers.extend([header.strip(":") for header in cols if header.endswith(':')])
        else:
            break
    csv_output.writerow(headers)
    f_input.seek(0)
    entry = []
    for cols in csv_input:
        if cols[0] == 'Date:':
            entry.extend([cols[1], cols[4], cols[-1]])
        elif len(cols) > 1:
            entry.append(cols[1])
        elif cols[0].startswith(' *'):
            csv_output.writerow(entry)
            entry = []
This would give you an output CSV file looking like:
Date,Time,Name,System Name,Operator,Action,Comment,Revision,Location,Seq Number
8/21/2015,16:18:38,NC.S.RHU10.BRD,NC.S.RHU10.BRD,SYSTEM,Trend data loss, trend definition data loss occurred at 10:21:05 AM on 8/21/2015,6,,1278738
8/21/2015,16:17:17,SC.L.SIDESHOWBOB.MBC009,SC.L.SIDESHOWBOB.MBC009,SYSTEM,FLN device return from failure,"Z8 RETURN from failure in Cabinet 9, Lan 3, Drop 1.",81,SC.L.SIDESHOWBOB.MBC009,1278737
8/21/2015,16:17:17,NC.S.EHU07.EAT,NC.S.EHU07.EAT,ITWVSIEMP01\InsightSCH,Trend data collection The target object could not be found on the Field,Trend COV (0.000) Failed - The target object could not be found on the,1318,ITWVSIEMP01,1278735
8/21/2015,16:17:15,NC.S.EHU03.TCFM,NC.S.EHU03.TCFM,ITWVSIEMP01\InsightSCH,Trend data collection,COV Data Loss Detected,1481,ITWVSIEMP01,1278734
Tested using Python 2.7. If you are using Python 3, change the output line to open("output.csv", "w", newline="").
Note: there is no 'Type' field in your example data.

Extract sequences from a FASTA file to multiple files, based on header IDs in a separate file

I am looking for a Python solution to extract multiple sequences from a FASTA file into multiple files, based on matches to a list of header IDs in a separate file.
This is a slightly more complex version of the problem posted on Extract sequences from a FASTA file based on entries in a separate file and https://www.biostars.org/p/2822/, which only output a single file for all matches.
I am new to Python and am trying to find a way to:
Take a file containing strings that will be in the FASTA headers
Have all records that match a string written to a separate FASTA file
The header_ID_strings file looks like this:
CAP357_2030_09WPI, CAP357_2040_11WPI, CAP357_2050_13WPI, etc...
A sample of my FASTA file looks like this:
>CAP357_2030_009wpi_v1v3_1_056_00002_000.4
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGG
>CAP357_2040_011wpi_v1v3_1_008_00006_001.1
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
>CAP357_2040_011wpi_v1v3_1_030_00002_000.4
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
>CAP357_2040_011wpi_v1v3_1_004_00001_000.2
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
>CAP357_2050_013wpi_v1v3_1_047_00002_000.4
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
Expected output:
file1: CAP357_2030_009wpi_v1v3.fasta
>CAP357_2030_009wpi_v1v3_1_056_00002_000.4
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGG
file2: CAP357_2040_011wpi_v1v3.fasta
>CAP357_2040_011wpi_v1v3_1_008_00006_001.1
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
>CAP357_2040_011wpi_v1v3_1_030_00002_000.4
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
>CAP357_2040_011wpi_v1v3_1_004_00001_000.2
GTAAAATTAACCCCACTCTGTGTCACTCTAAATTGTACAACTGCAAAGGGT
etc...
This code is from the link above, but I want to have:
* matches written to separate outfiles
* no need to specify each outfile separately, if possible (I will have up to 30 outfiles)
#!/usr/bin/env python
import sys
from Bio import SeqIO

input_file = sys.argv[1]
id_file = sys.argv[2]
output_file = sys.argv[3]

wanted = set(line.rstrip("\n").split(None, 1)[0] for line in open(id_file))
print "Found %i unique identifiers in %s" % (len(wanted), id_file)

index = SeqIO.index(input_file, "fasta")
records = (index[r] for r in wanted)
count = SeqIO.write(records, output_file, "fasta")
assert count == len(wanted)
print "Saved %i records from %s to %s" % (count, input_file, output_file)
So far this is what I have come up with (script below), but I don't know how to get around manually specifying all the outfiles and variables (I have only included three here):
from Bio import SeqIO
import pandas as pd
import sys

input_file = sys.argv[1]
id_file = sys.argv[2]
output_file2020 = sys.argv[3]
output_file2030 = sys.argv[4]
output_file2040 = sys.argv[5]

colnames = ["2020", "2030", "2040"]
headerlist = pd.read_csv(id_file, names=colnames, header=None)
infile = list(SeqIO.parse(input_file, "fasta"))

# identifiers cannot start with a digit, so the tuples are named seq_2020 etc.
seq_2020 = tuple(headerlist["2020"])
seq_2030 = tuple(headerlist["2030"])
seq_2040 = tuple(headerlist["2040"])

count2020 = 0
count2030 = 0
count2040 = 0

for record in infile:
    if record.id in seq_2020:
        SeqIO.write([record], output_file2020, "fasta")
        count2020 += 1
    elif record.id in seq_2030:
        SeqIO.write([record], output_file2030, "fasta")
        count2030 += 1
    elif record.id in seq_2040:
        SeqIO.write([record], output_file2040, "fasta")
        count2040 += 1
    else:
        print("no matches found")

print("number of 2020 records is", count2020)
print("number of 2030 records is", count2030)
print("number of 2040 records is", count2040)
For the sake of completeness, here is the 'final' script:
#!/usr/bin/env python
# a script to extract fasta records from a fasta file into multiple separate fasta files,
# based on a particular ID (time point) in a particular field, for a given delimiter
# to run, navigate to the file location with a command prompt and enter:
#   python split_fasta_by_collections.py infile.fasta
from Bio import SeqIO
import os
import sys

records = SeqIO.parse(sys.argv[1], "fasta")
collected = {}
for record in records:
    descr = record.description.split("_")[1]  # "_" sets the delimiter; "1" sets the field, counting from 0 for the first field
    try:
        collected[descr].append(record)
    except KeyError:
        collected[descr] = [record, ]

file_name = "outfile%s.fasta"
file_path = os.getcwd()  # sets the output file path to your current working directory
for (descr, records) in collected.items():
    with open(os.path.join(file_path, file_name % descr), 'w') as f:
        SeqIO.write(records, f, 'fasta')
A couple of brief suggestions:
If all your headers follow the same pattern, then you can extract the unique elements:
record.description.split("_")[1]
(yields "2040" from "CAP357_2040_011wpi_v1v3_1_008_00006_001.1")
If you use a dict you can assemble collections of records:
collected = {}
for record in records:
    descr = record.description.split("_")[1]
    try:
        collected[descr].append(record)
    except KeyError:
        collected[descr] = [record, ]
Then you can write out each collection to a new file:
file_name = "outfile%s"
for (descr, records) in collected.items():  # iteritems in python2
    with open(os.path.join(file_path, file_name % descr), 'w') as f:
        SeqIO.write(records, f, 'fasta')
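An equivalent alternative to the try/except grouping, if you prefer, is collections.defaultdict (not what the answer used, but it does the same thing):

from collections import defaultdict

collected = defaultdict(list)  # missing keys start out as empty lists
for record in records:
    collected[record.description.split("_")[1]].append(record)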

Reading comma separated values from text file in python

I have a text file consisting of 100 records like
fname,lname,subj1,marks1,subj2,marks2,subj3,marks3
I need to extract and print lname and marks1+marks2+marks3 in Python. How do I do that? I am a beginner in Python; please help.
When I used split, I got an error saying
TypeError: Can't convert 'type' object to str implicitly
The code was
import sys

file_name = sys.argv[1]
file = open(file_name, 'r')
for line in file:
    fname = str.split(str=",", num=line.count(str))
    print fname
If you want to do it that way, you were close. Is this what you were trying?
file = open(file_name, 'r')
for line in file.readlines():
    fname = line.rstrip().split(',')  # using rstrip to remove the \n
    print fname
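From there, to get what the question asks for (lname plus the summed marks), one might index into the split fields, assuming the column order given in the question (a sketch, in the same Python 2 style):

file = open(file_name, 'r')
for line in file.readlines():
    fields = line.rstrip().split(',')
    lname = fields[1]
    total = int(fields[3]) + int(fields[5]) + int(fields[7])  # marks1 + marks2 + marks3
    print lname, total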
Note: it's not tested code, but it tries to solve your problem. Please give it a try:
import csv

with open(file_name, 'rb') as csvfile:
    marksReader = csv.reader(csvfile)
    for row in marksReader:
        if len(row) < 8:  # 8 is the number of columns in your file;
            # row has some missing columns or is empty
            continue
        # Unpack columns of row; you can also do fname = row[0], lname = row[1], and so on ...
        (fname, lname, subj1, marks1, subj2, marks2, subj3, marks3) = row  # plain tuple unpacking; '= *row' is a syntax error
        # you can use float in place of int if marks contain decimals
        totalMarks = int(marks1) + int(marks2) + int(marks3)
        print '%s %s scored: %s' % (fname, lname, totalMarks)
    print 'End.'
"""
sample file content
poohpool#signet.com; meixin_kok#hotmail.com; ngai_nicole#hotmail.com; isabelle_gal#hotmail.com; michelle-878#hotmail.com;
valerietan98#gmail.com; remuskan#hotmail.com; genevieve.goh#hotmail.com; poonzheng5798#yahoo.com; burgergirl96#hotmail.com;
insyirah_powergals#hotmail.com; little_princess-angel#hotmail.com; ifah_duff#hotmail.com; tweety_butt#hotmail.com;
choco_ela#hotmail.com; princessdyanah#hotmail.com;
"""
import pandas as pd
file = open('emaildump.txt', 'r')
for line in file.readlines():
fname = line.split(';') #using split to form a list
#print(fname)
df1 = pd.DataFrame(fname,columns=['Email'])
print(df1)
