I keep getting the error: "ValueError: too many values to unpack (expected 12)". I've checked the dataset, counted all the variables at least 10 times and I'm banging my head against the wall....What am I doing wrong?
Code (Python 3.7):
class Corona(object):
def __init__(self, FIPS, County, State, Country, Updated, Latitude, Longitude,
Confirmed, Deaths, Recovered, Active, Combined_Key):
self.FIPS = FIPS
self.County = County
self.State = State
self.Country = Country
self.Updated = Updated
self.Latitude = Latitude
self.Longitude = Longitude
self.Confirmed = Confirmed
self.Deaths = Deaths
self.Recovered = Recovered
self.Active = Active
self.Combined_Key = Combined_Key
def __str__(self):
return self.FIPS + "/" + \
self.County + "/" + \
self.State + "/" + \
self.Country + "/" + \
self.Updated + "/" + \
self.Latitude + "/" +\
self.Longitude + "/" + \
self.Confirmed + "/" + \
self.Deaths + "/" + \
self.Recovered + "/" + \
self.Active + "/" + \
self.Combined_Key
def loadPeople(filename, slist):
ifile = open(filename, 'r')
for line in ifile:
FIPS, \
County, \
State, \
Country, \
Updated, \
Latitude, \
Longitude, \
Confirmed, \
Deaths, \
Recovered, \
Active, \
Combined_Key, \
= line.split(',')
s = Corona(FIPS, County, State, Country, Updated, Latitude, Longitude,
Confirmed, Deaths, Recovered, Active, Combined_Key)
slist.append(s)
def main():
filename = "CoronaVirus.txt"
sick = []
loadPeople(filename, sick)
if __name__ == ("__main__"):
main()
First 3 lines of dataset
45001,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.22333378,-82.46170658,4,0,0,0,"Abbeville, South Carolina, US"
22001,Acadia,Louisiana,US,2020-04-01 21:58:49,30.295064899999996,-92.41419698,47,1,0,0,"Acadia, Louisiana, US"
51001,Accomack,Virginia,US,2020-04-01 21:58:49,37.76707161,-75.63234615,7,0,0,0,"Accomack, Virginia, US"
You have commas inside some of your quoted fields, so those fields are being split on the commas
s = '45001,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.22333378,-82.46170658,4,0,0,0,"Abbeville, South Carolina, US"'
len(s.split(','))
# 14
Instead of manually splitting the lines, use the csv module:
import csv
with ifile as open(filename):
reader = csv.reader(ifile)
for line in reader: # line is a list
slist.append(Corona(*line))
Actually from your example line.split(',') is not the right thing as your split split in 14 and not in 12. When you split in this way you are also splitting inside the double quotes chars. When you split the following line like this the length will be 14, which is not what you want.
45001,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.22333378,-82.46170658,4,0,0,0,"Abbeville, South Carolina, US"
You can modify the definition of loadPeople with
def loadPeople(filename, slist):
ifile = open(filename, 'r')
for line in ifile:
FIPS, \
County, \
State, \
Country, \
Updated, \
Latitude, \
Longitude, \
Confirmed, \
Deaths, \
Recovered, \
Active, \
Combined_Key, \
= line.split('"')[0].split(',')[:-1]+[line.split('"')[1]]
s = Corona(FIPS, County, State, Country, Updated, Latitude, Longitude,
Confirmed, Deaths, Recovered, Active, Combined_Key)
slist.append(s)
Simple problem. but might be hard to eliminate. When you split the line by commas, you also split that last item which has the full location into three parts. Therefore, you get two extra values ("Abbeville, South Carolina, US") becomes three items. Try
for line in ifile:
FIPS, \
County, \
State, \
Country, \
Updated, \
Latitude, \
Longitude, \
Confirmed, \
Deaths, \
Recovered, \
Active, \
Combined_Key1, \
Combined_Key2, \
Combined_Key3 \
= line.split(',')
And then just:
Combined_Key = ",".join(Combined_Key1, Combined_Key2, Combnied_Key3)
Though since it seems the combined key is just the County, State, and Country joined together, you could also try just using those three joined as your combined key. Be warned though, if any of your lines has a differently formatted combined-key, you will need a more sophisticated approach.
Additionally, if you are sure this file is secure, you could just use csv to get a list of variables though it takes more time to learn.
Related
I am trying to insert 30 Millions records from Data-bricks to azure SQL. SPN ID validity is 65 mins, so single insertion is not happening. I am trying to insert data with batches each batch has 2M records and I am generating new token for each batch, but still I am getting same error after 4 batches (after inserting 6M records (after 1:30 hrs it's failing)).
Error : Token Expired
table_name = "TABLE NAME"
if count <= 2000000:
access_token,connection_string = Service_Principal()
df.write.format("jdbc") \
.mode("append") \
.option("url", jdbcUrl) \
.option("dbtable", table_name) \
.option("accessToken", access_token) \
.option("encrypt", "true") \
.option("hostNameInCertificate", "") \
.option("driver","")\
.save()
else:
chunk=2000000
id1 = 0
id2 = chunk
c =count
while id1 < c:
print("Insertion STARTED at : "+ str(datetime.datetime.now()))
stop_df = final_df.filter( (final_df.id_tmp < id2) & (final_df.id_tmp >= id1))
access_token,connection_string = Service_Principal()
df.write.format("jdbc") \
.mode("append") \
.option("url", jdbcUrl) \
.option("dbtable", table_name) \
.option("accessToken", access_token) \
.option("encrypt", "true") \
.option("hostNameInCertificate", "") \
.option("driver","")\
.save()
print("Insertion COMPLETED at : "+ str(datetime.datetime.now()))
id1+=chunk
id2+=chunk
How we can close JDBC connection in each Batch or how to delete SPN ID in each Batch
Hello I'm trying to Oracle Partitioned table to Datalake parquet file.
Using this script
# Convert it to Spark SQL table and save it as parquet format
df.write \
.format("parquet") \
.option("path","/archive/" + schema_name + "/" + table_name + ".parquet") \
.mode("append") \
.saveAsTable(table_name)
This code get all data of table not partition.
spark = SparkSession.builder \
.appName("Load " + schema_name + " " + table_name + " from Oracle into Parquet and creating Table") \
.getOrCreate()
This one is Creating table from Oracle
How can i get this only parquet :)
you said when you select from the table you are getting data from the table and you want from a particular partition. Did you try putting the partition name using the syntax PARTION(partition_name).
How many partitions do you have, if they are not too many then you can try creating a view for each partition and then selecting data from the view.
Created table in oracle named Checkes
Add partition name to table
after I can read this partition name from Spark.
query = '(select partition_name from Schema.checkes c) checkes'
df = spark.read \
.format("jdbc") \
.option("url","jdbc:oracle:thin:#" + db_host + ":" + db_port + "/" + db_service) \
.option("dbtable",query) \
.option("user",db_user) \
.option("password",db_pass) \
.option("driver","oracle.jdbc.OracleDriver") \
.option("encoding","UTF-8") \
.option("fetchSize", 10000) \
.option("numPartitions",40) \
.load()
print("part count: " + str(df.count()))
if df.count() > 0:partition_name = df.select("partition_name").collect()[0]["partition_name"]
df1 = spark.read \
.format("jdbc") \
.option("url","jdbc:oracle:thin:#" + db_host + ":" + db_port + "/" + db_service) \
.option("dbtable",query1) \
.option("user",db_user) \
.option("password",db_pass) \
.option("driver","oracle.jdbc.OracleDriver") \
.option("encoding","UTF-8") \
.option("fetchSize", 10000) \
.option("numPartitions",40) \
.load()
```
I'm working on a Python 3 script that among other things, at some point it needs to create a .JKS or .P12 keystore. I use to have a bash script that used keytool for this:
keytool -genkey -keyalg RSA -alias certAlias \
-keystore keystore.jks -storepass $keyPass \
-validity 360 -keysize 2048 \
-noprompt -dname "CN=com.myCompany, OU=ID, O=AwesomeSoft, L=SF, S=CA, C=US" \
-keypass $keyPass
mv ./keystore.jks src/main/resources/
Now i'm moving the same functionality from that bash script to python and I having some issues to figure it out and any pointer will ne more than welcome.. you may noticed that the example above is for jks, not p12... the newer version have to be able to, depending on a variable before called certType with create one or the other... or create a jks and later convert it to p12... i'm open to options..
Thanks in advance!!
Found my answer:
import os
certAlias = 'cert'
certAlg = 'RSA'
certSigAlg = 'SHA1withRSA'
certExp = '365'
certKeySize = '2048'
certKeyType = 'PKCS12' # Select PKCS12 or JKS
certKeyPass = 'password123'
fileName = 'keystore'
dname = 'CN=mySite.com'
#
if certKeyType == "PKCS12":
fileExt = 'p12'
elif certKeyType == "JKS":
fileExt = 'jks'
certFile = fileName + '.' + fileExt
keytool = 'keytool -genkey -noprompt \
-alias ' + certAlias + ' \
-keypass ' + certKeyPass + ' \
-keyalg ' + certAlg + ' \
-sigalg ' + certSigAlg + '\
-validity ' + certExp + ' \
-dname ' + dname + ' \
-keysize ' + certKeySize + ' \
-keystore ' + certFile + ' \
-storepass '+ certKeyPass +' \
-storetype ' + certKeyType
os.system(keytool)
I did this and works but I will be playing to add more logic... hope it helps anyone.
Does anyone knows why I have this error on output of my Python document? I'm using lxml element etree.
The error shows up when I define the functions
...............................................................................................................................................................
List all purchases of the year chosen by the user:
Buy: 2017/12/01, Delivered, Ship all over as each product is available.
Traceback (most recent call last):
File "D: \ ABC \ 1stan \ 1st Semester \ TI \ tp3 \ python \ ola.py", line
55, in <module>
printCompras (treeDoc, "2017", prefix)
File "D: \ ABC \ 1styear \ 1st Semester \ TI \ tp3 \ python \ ola.py",
line 20, in printCompras
printCompra (compraElem, prefix)
File "D: \ ABC \ 1styear \ 1st Semester \ TI \ tp3 \ python \ ola.py",
line 34, in printCompra
"+", "+ buy_id [0] .text +", "+ buy_id [0] .text +", "+ buy_id [0] .text
+ )
IndexError: list index out of range
My Python file:
from lxml import etree
file = "loja.xml"
treeDoc = etree.parse (file)
buyElem = treeDoc.xpath ("// purchase")
listUsers = treeDoc.xpath ("// user")
productElem = treeDoc.xpath ("// product")
characteristicaElem = treeDoc.xpath ("// product / technical_characters /
feature")
olamundo = treeDoc.xpath ("// purchase / product_bought")
print ("List all Purchases of the year chosen by the user:")
prefix = ""
def printCompras (treeDoc, year, prefix):
for purchase in compraElem:
buy_date = purchase.xpath ("purchase_date")
ano_data = (buy_date [0] .text) .split ("/")
buy_year = year_data [0]
if year == year_of_buy:
printCompra (compraElem, prefix)
def printCompra (compraElem, prefix):
for purchase in compraElem:
buyer_id = buyElem [0] .get ("buyer")
for user in listaUsers:
user_name = user.xpath ("# user_id =" + buyer_id + "/ name")
compra_id = compra.xpath ("compra_id")
buy_date = purchase.xpath ("purchase_date")
buy_account = buy.xpath ("buy_account") [0]
indication = purchase.xpath ("indication")
for purchase in compraElem:
print (prefix + "Buy:" + buy_date [0] .text + "," + buy_date.get
("situation") + "," + indication [0] .text)
"+", "+ buy_id [0] .text +", "+ buy_id [0] .text +", "+ buy_id [0]
.text + )
(+), "+", "+", "+", "+", "+", "+", "[ )
for product in productElem:
carrier = olamundo [0] .get ("carrier")
printProduct (productName, carrier, prefix)
def printProduct (productEle, carrier, prefix):
product_id = olamundo [0] .get ("# product_id")
product_name = treeDoc.xpath ("// buy [purchased_product / # product_id =]")
price = treeDoc.xpath ("// product bla bla bla")
print (prefix + "Product:" + product_id + "," + name [0] .text + "," + price
[0] .text + "," +
print (prefix + "Product:" + product_id + "," + olamundo [0] .get ("#
carrier"))
def printVender ():
identifier, name, average-rating-in-stars
def printCaracteristica (characteristicElement, prefix):
name = characterElement.get ("name")
value = characterElement.get ("value")
print (prefix + "Characteristics:" + name + "-" + value)
printCompras (treeDoc, "2017", prefix)
I'm using spotlight via pyobjc. Which is working well except for when I try to limit the time period using kMDItemContentCreationDate. I think the issue is with my time format. Any help would be greatly appreciated.
from Cocoa import *
import sys
emails = [sys.argv[1], ]
predicate = "(kMDItemContentType = 'com.apple.mail.emlx') && (" + \
'||'.join(["((kMDItemAuthorEmailAddresses = '%s'))" % m for m in emails]) + \
"&& (kMDItemContentCreationDate > '2011-03-23 00:00:00')" + \
"&& (kMDItemContentCreationDate < '2012-03-24 00:00:00')" + \
")"
query = NSMetadataQuery.alloc().init()
query.setPredicate_(NSPredicate.predicateWithFormat_(predicate))
query.setSortDescriptors_(NSArray.arrayWithObject_(NSSortDescriptor.alloc().initWithKey_ascending_('kMDItemContentCreationDate', False)))
query.startQuery()
NSRunLoop.currentRunLoop().runUntilDate_(NSDate.dateWithTimeIntervalSinceNow_(5))
query.stopQuery()
results = query.results()[:5]
for item in results:
print "subject: ", item.valueForAttribute_("kMDItemSubject")