I am working on a Python script that scrapes a table from a web page, writes the rows to a CSV file, and then converts that CSV to JSON.
Right now everything is working except the part that has to go through all the table rows and save each one as a JSON object. The problem is that only the last row ends up in the JSON file; the previous rows are lost. I know where the problem is but I don't know how to fix it.
Here is the code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
import simplejson

url = 'http://website.com/group1.html'
htmlFile = urlopen(url)
soup = BeautifulSoup(htmlFile, 'html.parser')
table = soup.find_all("table", {"class": "sortable employeeList result-table"})[0]
rows = table.find_all('tr')

Filegroup1 = open('group1.csv', 'wt+')
Datagroup1 = csv.writer(Filegroup1)
jsonFilePath = r'group1.json'

# read the CSV back and dump it as JSON
def make_json(Filegroup1, jsonFilePath):
    data2 = {}
    with open('group1.csv', encoding='utf-8') as csvf:
        csvReader = csv.DictReader(csvf)
        for rows in csvReader:
            data2 = rows  # <-- each iteration overwrites data2, so only the last row survives
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonf.write(simplejson.dumps([data2], indent=4))

try:
    # write every table row to the CSV
    for row in rows:
        FilteredRow = []
        for cell in row.find_all(['td', 'th']):
            FilteredRow.append(cell.get_text().strip())
        Datagroup1.writerow(FilteredRow)
finally:
    Filegroup1.close()

make_json(Filegroup1, jsonFilePath)
The issue is here:
for rows in csvReader:
    data2 = rows
If I change it to the following, it works, but it groups each object under a key (the employee name), which I don't want:
for rows in csvReader:
    key = rows['Employee Name']
    data3[key] = rows
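One way to keep every row, rather than only the last one, is to collect the rows in a list and dump the whole list once the loop is done. A minimal sketch of make_json along those lines, using the same file names as above:

def make_json(csvFilePath, jsonFilePath):
    all_rows = []
    with open(csvFilePath, encoding='utf-8') as csvf:
        csvReader = csv.DictReader(csvf)
        for row in csvReader:
            all_rows.append(row)  # keep every row instead of overwriting one variable
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonf.write(simplejson.dumps(all_rows, indent=4))

make_json('group1.csv', 'group1.json')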
# gets rid of spaces in existing csv headers
def getColumns(readCSV):
    return [column.replace(' ', '') for column in next(readCSV)]
# insert format used to avoid hard-coded headers in script
def insertData(tableName, columns, readCSV):
    print("Inserting Data")
    query = 'INSERT INTO {}({}) VALUES ({})'.format(
        tableName,
        ','.join(columns),
        ','.join('?' * len(columns))
    )
    for data in readCSV:
        cursor.execute(query, data)
    con.commit()
def updateTable(csvPath, tableName):
    print("Updating table...")
    print("Reading file contents and uploading into db table...")
    ## insert timestamp column into existing csv. Does not incorporate header correctly for timestamp
    rows = []
    with open(csvPath, 'r', newline='') as csvFile:
        readCSV = csv.reader(csvFile, delimiter=',')
        for row in readCSV:
            rows.append(row)
    with open(csvPath, 'w', newline='') as writeFile:
        file_write = csv.writer(writeFile)
        for val in rows:
            timestamp = datetime.now()
            val.insert(0, timestamp)
            file_write.writerow(val)
    with open(csvPath) as csvFile:
        readCSV = csv.reader(csvFile, delimiter=',')
        columns = getColumns(readCSV)
        insertData(tableName, columns, readCSV)
    print("Upload complete")
Above is a snippet of the code I'm working on. I am gathering data from a CSV to insert into a SQL database. Currently the CSV does not have a timestamp column, and without one the import won't work because it sees duplicate data. I found a solution at https://www.geeksforgeeks.org/how-to-add-timestamp-to-csv-file-in-python/ for adding a timestamp column and have incorporated it into the code, but it does not add a header for the new column. I'm sure it's an easy fix, but I am new to Python and can't find the solution anywhere else. Also, if you see something inefficient in updateTable, let me know so that I can recognize it and learn a better way.
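One way to get the missing header, assuming the existing CSV already starts with a header row, is to treat the first row of the rewrite specially; a minimal sketch of just that loop from updateTable:

with open(csvPath, 'w', newline='') as writeFile:
    file_write = csv.writer(writeFile)
    for i, val in enumerate(rows):
        if i == 0:
            val.insert(0, 'timestamp')     # label the new column on the header row
        else:
            val.insert(0, datetime.now())  # timestamp every data row
        file_write.writerow(val)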
I ended up using a different method with pandas (import pandas as pd):
def updateTable(csvPath, tableName):
    print("Updating table...")
    print("Reading file contents and uploading into db table...")
    timestamp = datetime.now()
    df = pd.read_csv(csvPath)
    df.insert(0, 'timestamp', timestamp)
    df.to_csv(csvPath, index=False)
    with open(csvPath) as csvFile:
        readCSV = csv.reader(csvFile, delimiter=',')
        columns = getColumns(readCSV)
        insertData(tableName, columns, readCSV)
    print("Upload complete")
I have some code that scrapes several tables from webpages and then puts the data into several Excel files.
I also want to be able to add the company name at the bottom of the Excel file. I have worked out how to get the name of the company using companyname = soup.find('h1').text, as shown in the first code block below.
One of the Excel files is generated from the following lines of code:
all_data = {}
# for every table found on the page
for table in soup.select('table.BordCollapseYear2'):
    table_name = table.find_previous('b').text
    all_data[table_name] = []
    # scrape every row
    for tr in table.select('tr'):
        row = [td.get_text(strip=True, separator=' ') for td in tr.select('td')]
        if len(row) == 7:
            all_data[table_name].append(row)

companyname = soup.find('h1').text

with open('data2.csv', 'a', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in all_data:
        spamwriter.writerow(row)
I tried adding the line writerows(companyname); this worked, but it separated out each letter. So I think I am nearly there...
Put [] around companyname in writerow(). csv.writer.writerow() expects an iterable of field values, so a bare string gets split into its individual characters, while [companyname] makes it a single one-element row.
For example:
with open('data2.csv', 'a', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in all_data:
        spamwriter.writerow(row)
    spamwriter.writerow([companyname])  # <-- notice the `[]` around companyname
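A quick way to see the difference ("Acme" is just a made-up name here):

import csv, sys

writer = csv.writer(sys.stdout)
writer.writerow("Acme")    # a string is an iterable of characters, so this writes A,c,m,e
writer.writerow(["Acme"])  # a one-element list writes a single field: Acme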
I am trying to write the results of each of my queries to its own column, but when I run the script below the output is not formatted that way; the data does not line up in columns side by side.
results = []
results2 = []
results3 = []
results4 = []
results5 = []
results6 = []
cur.execute(dbQuery)
results.extend(cur.fetchall())
cur.execute(dbQuery2)
results2.extend(cur.fetchall())
cur.execute(dbQuery3)
results3.extend(cur.fetchall())
cur.execute(dbQuery4)
results4.extend(cur.fetchall())
cur.execute(dbQuery5)
results5.extend(cur.fetchall())
cur.execute(dbQuery6)
results6.extend(cur.fetchall())
with open("out.csv", "wb") as csv_file:
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Query1', 'Query2', 'Query3', 'Query4', 'Query5', 'Query6'])
csv_writer.writerow(results, results2, results3, results4, results5, results6)
You have to iterate over the results of all the queries together and then write the CSV file. I assumed results from the DB like the lists below.
import csv

results = [1, 2, 3, 4]
results2 = [11, 12, 13, 14]
results3 = [21, 22, 23, 24]
results4 = [31, 32, 33, 34]
results5 = [41, 42, 43, 44]
results6 = [51, 52, 53, 54]

# 'wb' is for Python 2; on Python 3 use open('out.csv', 'w', newline='')
with open('out.csv', 'wb') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Query1', 'Query2', 'Query3', 'Query4', 'Query5', 'Query6'])
    # zip(*...) transposes the lists so each row holds one value from every query
    csv_writer.writerows(zip(*[results, results2, results3, results4, results5, results6]))
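With those sample lists, the transposed rows written to out.csv would be:

Query1,Query2,Query3,Query4,Query5,Query6
1,11,21,31,41,51
2,12,22,32,42,52
3,13,23,33,43,53
4,14,24,34,44,54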
I'm trying to convert a text file to an Excel sheet in Python. The txt file contains data in the format specified below.
Column names: reg no, zip code, loc id, emp id, lastname, first name. Each record has one or more error numbers, and each record has its column names listed above the values. I would like to create an Excel sheet containing reg no, firstname, lastname, and the errors listed in separate rows for each record.
How can I put the records into an Excel sheet? Should I be using regular expressions? And how can I insert the error numbers in different rows for the corresponding record?
Here is the link to the input file:
https://github.com/trEaSRE124/Text_Excel_python/blob/master/new.txt
Any code snippets or suggestions are greatly appreciated.
Here is a draft. Let me know if any changes are needed:
# import pandas as pd
from collections import OrderedDict
from datetime import date
import csv

with open('in.txt') as f:
    with open('out.csv', 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        # Remove initial clutter
        while("INPUT DATA" not in f.readline()):
            continue
        header = ["REG NO", "ZIP CODE", "LOC ID", "EMP ID", "LASTNAME", "FIRSTNAME", "ERROR"]
        data = list()
        errors = list()
        spamwriter.writerow(header)
        print header
        while(True):
            line = f.readline()
            errors = list()
            if("END" in line):
                exit()
            try:
                int(line.split()[0])
                data = line.strip().split()
                f.readline()  # get rid of \n
                line = f.readline()
                while("ERROR" in line):
                    errors.append(line.strip())
                    line = f.readline()
                spamwriter.writerow(data + errors)
                csvfile.flush()  # csv.writer has no flush(); flush the underlying file instead
            except:
                continue
        # while(True):
        #     line = f.readline()
Use Python 2 to run it. The errors are appended as subsequent columns; doing it exactly the way you want is slightly more complicated. I can fix it if it's still needed.
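If you would rather run it under Python 3, the only changes should be the file mode and the print statement; a small sketch of just those lines:

# instead of open('out.csv', 'wb'):
with open('out.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)

# instead of `print header`:
print(header)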
You can do this using the openpyxl library, which can write items directly into a spreadsheet. This code shows how to do that for your particular situation.
from openpyxl import Workbook

NEW_PERSON, ERROR_LINE = 1, 2

def Line_items():
    with open('katherine.txt') as katherine:
        for line in katherine:
            line = line.strip()
            if not line:
                continue
            items = line.split()
            if items[0].isnumeric():
                yield NEW_PERSON, items
            elif items[:2] == ['ERROR', 'NUM']:
                yield ERROR_LINE, line
            else:
                continue

wb = Workbook()
ws = wb.active

ws['A2'] = 'REG NO'
ws['B2'] = 'LASTNAME'
ws['C2'] = 'FIRSTNAME'
ws['D2'] = 'ERROR'

row = 2
for kind, data in Line_items():
    if kind == NEW_PERSON:
        row += 2
        ws['A{:d}'.format(row)] = int(data[0])
        ws['B{:d}'.format(row)] = data[-2]
        ws['C{:d}'.format(row)] = data[-1]
        first = True
    else:
        if first:
            first = False
        else:
            row += 1
        ws['D{:d}'.format(row)] = data

wb.save(filename='katherine.xlsx')
I am writing a scraper and I want to loop through a list of links and merge all the results as columns into one dataframe, joined on the same key (like a left join).
I run this code in the IPython Notebook. The CSV that comes out of the resulting dataframe does not make sense; however, if after running the script I merge df and df2 on the mutual column "questions", I get the join I need, so something in the script itself is wrong.
Here's the whole script. There is a login with requests, but you don't have to make a user; you can run it as a guest, you just won't get all the answers in the review.
import requests
from bs4 import BeautifulSoup as bs
import csv
import pandas as pd

get_url = 'https://www.g2crowd.com/login?form=login'
post_url = 'https://www.g2crowd.com/user_sessions'
review_url = 'https://www.g2crowd.com/survey_responses/salesforce-crm-review-29972'

links = []
with open("links.csv", "r") as f:
    spamreader = csv.reader(f, delimiter=',')
    for row in spamreader:
        links.append(row)
links = links[1:]

s = requests.Session()
r = s.get(get_url)
soup = bs(r.text)
token = soup.select('input[name="authenticity_token"]')[0]['value']

username = 'email#gmail.com'
password = 'password'
payload = {"user_session[login_email]": "email#gmail.com", "user_session[password]": "password"}
payload['authenticity_token'] = token
Referer = dict(Referer=get_url)
r = s.post(post_url, data=payload, headers=Referer)
print r.status_code

df = pd.read_csv("data.csv")
# df = df.set_index('questions')

for link in links:
    r = s.get(link[0])
    soup = bs(r.text)
    title = soup.title.contents[0]
    question_wrapper = soup.findAll("div", class_="question-wrapper")
    print len(question_wrapper)

    questions = []
    answers = []
    scraped_titles = []
    tricky_title = 'Salesforce CRM Review by G2 Crowd User in Transportation/Trucking/Railroad - March 13, 2013'

    with open("scraped_titles.csv", "r") as f:
        spamreader = csv.reader(f, delimiter=',')
        for row in spamreader:
            scraped_titles.append(row[0])
    scraped_titles = set(scraped_titles)

    if (title not in scraped_titles and title != tricky_title):
        for question in question_wrapper:
            q = question.label.contents[0]
            a = question.div.contents[0].text
            questions.append(q)
            answers.append(a)
        # qa = zip(questions, answers)
        qa = dict(questions=questions, answers=answers)
        df2 = pd.DataFrame(qa)
        # df2 = df2.set_index('questions', inplace=True)
        # df2.to_csv(title + ".csv", encoding='utf-8')
        df = pd.merge(df, df2, how='left', on='questions')
        with open("scraped_titles.csv", "a") as csvwriter:
            spamreader = csv.writer(csvwriter, delimiter=',')
            spamreader.writerow([unicode(title).encode("utf-8")])
    else:
        pass

df.to_csv("all_data.csv", encoding='utf-8')
I also tried saving every review to its own .csv and then merging everything with pandas, but I get a weird error that I could barely find documented anywhere:
Error: new-line character seen in unquoted field - do you need to open
the file in universal-newline mode?
I have been trying to find my error for quite a while; if somebody can point it out, it would be extremely helpful.
Also, I hope I have formatted the post according to the rules; if not, please help me correct it.
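One thing that stands out in the merge loop: every df2 is built with the same two column names (questions and answers), so repeated pd.merge calls produce clashing answers_x / answers_y style columns, which may be why the final CSV looks wrong. A minimal, self-contained sketch of one way around that, naming each answer column after its review title (the toy data below is hypothetical):

import pandas as pd

# toy data standing in for two scraped reviews
df = pd.DataFrame({'questions': ['Q1', 'Q2', 'Q3']})
reviews = {
    'Review A': {'Q1': 'yes', 'Q2': 'no', 'Q3': '4/5'},
    'Review B': {'Q1': 'no', 'Q3': '3/5'},
}

for title, answers in reviews.items():
    # one column per review, named after the review, so merges never collide
    df2 = pd.DataFrame({'questions': list(answers.keys()), title: list(answers.values())})
    df = pd.merge(df, df2, how='left', on='questions')

print(df)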