I want to get all the data of the Lead object. I wrote this script to obtain the Lead data, but I only get 2000 records. What do I have to do to retrieve all the records of a Salesforce object with Python?
from simple_salesforce import Salesforce
from datetime import datetime
import csv
import os
import json

SALESFORCE_USERNAME = '123'
PASSWORD = '123'
SECURITY_TOKEN = '123'


def main():
    # Authentication settings
    sf = Salesforce(username=SALESFORCE_USERNAME,
                    password=PASSWORD,
                    security_token=SECURITY_TOKEN)

    # Lead columns to be acquired
    columns = [
        "Id",
        "Email",
        "Company",
        "Address",
        "Phone",
        "FirstName",
        "LastName",
        "CreatedDate",
    ]
    sosl = 'SELECT {0[0]}, {0[1]}, {0[2]}, {0[3]}, {0[4]}, {0[5]}, {0[6]}, {0[7]} FROM Lead'.format(
        columns)

    # Data acquisition with SOSL
    data = sf.query(sosl)

    # Delete the CSV file if it already exists
    output_csv = 'output.csv'
    if os.path.exists(output_csv):
        os.remove(output_csv)

    # Write to CSV file
    for k, v in data.items():
        if type(v) is list:
            with open(output_csv, 'w', newline="") as f:
                writer = csv.DictWriter(f, fieldnames=columns)
                writer.writeheader()
                for d in v:
                    data = json.loads(json.dumps(d))
                    del data['attributes']
                    d = datetime.strptime(
                        data['CreatedDate'], '%Y-%m-%dT%H:%M:%S.%f%z')
                    data['CreatedDate'] = d.strftime('%Y-%m-%d %H:%M:%S')
                    writer.writerow(data)


if __name__ == '__main__':
    main()
If anyone knows, please let me know.
You can obtain all responsive records via the query_all() or query_all_iter() methods, which are documented under Queries in the simple_salesforce documentation.
Note that the query you are running is SOQL, not SOSL. SOSL is for full-text search.
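For example, a minimal sketch using the same credentials and a shortened field list (query_all_iter() streams the records instead of returning them all at once):

from simple_salesforce import Salesforce

sf = Salesforce(username=SALESFORCE_USERNAME,
                password=PASSWORD,
                security_token=SECURITY_TOKEN)

soql = "SELECT Id, Email, Company, CreatedDate FROM Lead"

# query_all() follows the pagination (nextRecordsUrl) for you and
# returns every matching record, not just the first 2000
data = sf.query_all(soql)
records = data['records']

# or, to avoid holding everything in memory at once:
for record in sf.query_all_iter(soql):
    print(record['Id'])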
I am trying to anonymize data in a CSV; however, I only want to do this for cells that are not empty. At present, my program adds anonymized data to all cells in each row.
How can I skip the empty cells? Below is my program:
import csv
from faker import Faker
from collections import defaultdict


def anonymize():
    "Anonymizes the given original data to anonymized form"
    faker = Faker()
    names = defaultdict(faker.name)
    emails = defaultdict(faker.email)

    with open(filename, "r") as f:
        with open(f"{filename}-anonymized_data.csv", "w") as o:
            reader = csv.DictReader(f)
            writer = csv.DictWriter(o, reader.fieldnames)
            writer.writeheader()
            for row in reader:
                row["adult_First_Name"] = names[
                    row["adult_First_Name"]
                ]
                row["child_First_Name"] = names[
                    row["child_First_Name"]
                ]
                row["Adult - EMAIL ADDRESS"] = emails[row["Adult - EMAIL ADDRESS"]]
                row["Parent - EMAIL ADDRESS"] = emails[row["Parent - EMAIL ADDRESS"]]
                writer.writerow(row)


if __name__ == "__main__":
    anonymize()
You could test each field before applying the fake value. A simpler approach is to store the fields that need to be changed in a fields list, along with the faker function to apply to each:
import csv
from faker import Faker


def anonymize():
    "Anonymizes the given original data to anonymized form"
    faker = Faker()
    fields = [
        ("adult_First_Name", faker.name),
        ("child_First_Name", faker.name),
        ("Adult - EMAIL ADDRESS", faker.email),
        ("Parent - EMAIL ADDRESS", faker.email),
    ]
    with open(filename, "r") as f:
        with open(f"{filename}-anonymized_data.csv", "w", newline="") as o:
            reader = csv.DictReader(f)
            writer = csv.DictWriter(o, reader.fieldnames)
            writer.writeheader()
            for row in reader:
                for field, fake in fields:
                    # only replace cells that actually contain a value
                    if row[field]:
                        row[field] = fake()
                writer.writerow(row)


if __name__ == "__main__":
    anonymize()
Adding newline='' would stop extra blank lines in the output.
I have created a function that fetches the price, rating, etc. after it hits an API:
def is_priced(business_id):
    try:
        priced_ind = get_business(API_KEY, business_id)
        priced_ind1 = priced_ind['price']
    except:
        priced_ind1 = 'None'
    return priced_ind1

priced_ind = is_priced(b_id)
print(priced_ind)
Similarly for rating:
def is_rated(business_id):
    try:
        rated_ind = get_business(API_KEY, business_id)
        rated_ind1 = rated_ind['rating']
    except:
        rated_ind1 = 'None'
    return rated_ind1
However, I want my function to loop through the business names in my CSV file, fetch all of this data, and export it to a new CSV file with these two values beside the name of each business.
The CSV file contains the name of each business along with its address, city, state, zip and country, e.g.:
Name address city state zip country
XYZ(The) 5* WE 223899th St. New York NY 19921 US
My output:
Querying https://api.xyz.com/v3/businesses/matches ...
True
Querying https://api.xyz.com/v3/businesses/matches ...
4.0
Querying https://api.xyz.com/v3/businesses/matches ...
$$
Querying https://api.xyz.com/v3/businesses/matches ...
Querying https://api.xyz.com/v3/businesses/matches ...
The real issue is that my output CSV only contains the business ID, and the rating etc., as you can see, is just printed to the console. How do I set up a loop so that the information I want for every business ends up in a single CSV?
The csv module is useful for this sort of thing, e.g. (using the is_priced and is_rated functions you already have):

import csv

with open('f.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    with open('tmp.csv', 'w') as output:
        writer = csv.writer(output)
        for row in reader:
            business_id = row[0]
            row.append(is_priced(business_id))
            row.append(is_rated(business_id))
            writer.writerow(row)
You can read the business names from the CSV file, iterate over them using a for loop, hit the API and store the results, and write to a new CSV file.
import csv

data = []
with open('businesses.csv') as fp:
    # skip header line
    header = next(fp)
    reader = csv.reader(fp)
    for row in reader:
        b_name = row[0]
        # not sure how you get the business ID:
        b_id = get_business_id(b_name)
        p = is_priced(b_id)
        r = is_rated(b_id)
        data.append((b_name, p, r))

# write out the results
with open('business_data.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerow(['name', 'price', 'rating'])
    for row in data:
        writer.writerow(row)
You can do this easily using pandas:

import pandas as pd

df = pd.read_csv('your_csv.csv', usecols=['business_name'])  # since you only need the name
# each function receives a whole row and can read row['business_name']
df['price'] = df.apply(is_priced, axis=1)
df['rating'] = df.apply(is_rated, axis=1)
df.to_csv('result.csv', index=False)
All you have to do in your functions is:
def is_priced(row):
    business_name = row['business_name']
    business_id = ??
    ...
I am storing some data in InfluxDB, and it is quite confusing that Influx is 4-5 times as slow as MySQL. To test, I inserted 10000 rows into MySQL and then into InfluxDB; the stats are below.
Mysql
real: 6m 39sec
user: 2.956sec
sys: 0.504sec
Influxdb
real: 6m 17.193sec
user: 11.860sec
sys: 0.328sec
My code for Influx is given below; I used the same pattern to store the data in MySQL.
#!/usr/bin/env python
# coding: utf-8
import time
import csv
import sys
import datetime
import calendar
import pytz
from influxdb import client as influxdb
from datetime import datetime

host = 'localhost'
port = 8086
user = "admin"
password = "admin"
db_name = "testdatabase"

db = influxdb.InfluxDBClient(database=db_name)


def read_data():
    with open(file) as f:
        reader = f.readlines()[4:]
        for line in reader:
            yield (line.strip().split(','))


fmt = '%Y-%m-%d %H:%M:%S'
file = '/home/rob/mycsvfile.csv'

csvToInflux = read_data()
body = []
for metric in csvToInflux:
    timestamp = datetime.strptime(metric[0][1: len(metric[0]) - 1], fmt)
    new_value = float(metric[1])
    body.append({
        'measurement': 'mytable1',
        'time': timestamp,
        'fields': {
            'col1': metric[1],
            'col2': metric[2],
            'col3': metric[3],
            'col4': metric[4],
            'col5': metric[5],
            'col6': metric[6],
            'col7': metric[7],
            'col8': metric[8],
            'col9': metric[9]
        }
    })
    db.write_points(body)
Can someone give me an idea of how I can improve this? I think it might be due to caching. Is the cache option off by default in InfluxDB? And can someone guide me on how to do batch processing in Influx? I tried looking over SO and Google but couldn't solve my problem. I am a newbie to InfluxDB and am trying to make it faster.
Thanks for any help or tips.
Inserting into InfluxDB one point at a time is slow; you should do it in batches. For example, with a CSV of 10000 lines, inserting one by one:
import influxdb

inf = influxdb.InfluxDBClient('localhost', 8086, 'root', 'root', 'example1')

with open('/tmp/blah.csv') as f:
    lines = f.readlines()

for line in lines:
    parts = line.split(',')
    json_body = [{
        'measurement': 'one_by_one',
        'time': parts[0],
        'fields': {
            'my_value': int(parts[1].strip())
        }
    }]
    inf.write_points(json_body)
This gives me a result of
└─ $ ▶ time python influx_one.py
real 1m43.655s
user 0m19.547s
sys 0m3.266s
And with a small change, inserting all the lines of the CSV in one go:
json_body = []
for line in lines:
    parts = line.split(',')
    json_body.append({
        'measurement': 'one_batch',
        'time': parts[0],
        'fields': {
            'my_value': int(parts[1].strip())
        }
    })

inf.write_points(json_body)
The result is much much better:
└─ $ ▶ time python influx_good.py
real 0m2.693s
user 0m1.797s
sys 0m0.734s
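If the CSV is too large to keep every point in memory at once, a middle ground is to flush in fixed-size chunks. A sketch building on the lines and inf variables from the batch version above (the chunk size of 5000 is arbitrary, not something InfluxDB requires):

BATCH_SIZE = 5000  # arbitrary chunk size, tune to taste

json_body = []
for line in lines:
    parts = line.split(',')
    json_body.append({
        'measurement': 'one_batch',
        'time': parts[0],
        'fields': {
            'my_value': int(parts[1].strip())
        }
    })
    # flush a full chunk, then start a fresh one
    if len(json_body) >= BATCH_SIZE:
        inf.write_points(json_body)
        json_body = []

# flush whatever is left over
if json_body:
    inf.write_points(json_body)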
Looking to index a CSV file to ElasticSearch, without using Logstash.
I am using the elasticsearch-dsl high level library.
Given a CSV with a header, for example:
name,address,url
adam,hills 32,http://rockit.com
jane,valleys 23,http://popit.com
What is the best way to index all the data by these fields? Eventually I'm looking to get each row to look like this:
{
    "name": "adam",
    "address": "hills 32",
    "url": "http://rockit.com"
}
This kind of task is easier with the lower-level elasticsearch-py library:
from elasticsearch import helpers, Elasticsearch
import csv

es = Elasticsearch()
with open('/tmp/x.csv') as f:
    reader = csv.DictReader(f)
    helpers.bulk(es, reader, index='my-index', doc_type='my-type')
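If you need more control over how each row becomes a document, for example an explicit document id, you can hand helpers.bulk a generator of action dicts instead. A sketch, where the index name and the choice of the name column as the id are just illustrative (on older clusters you may also need a _type field matching the doc_type above):

from elasticsearch import helpers, Elasticsearch
import csv

es = Elasticsearch()

def actions(path):
    # wrap each CSV row in an explicit bulk action
    with open(path) as f:
        for row in csv.DictReader(f):
            yield {
                "_index": "my-index",
                "_id": row["name"],   # illustrative: use whichever column is unique
                "_source": row,
            }

helpers.bulk(es, actions('/tmp/x.csv'))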
If you want to build an Elasticsearch index from a .tsv/.csv with strict types and a model, for better filtering, you can do something like this:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch_dsl import DocType, Text


class ElementIndex(DocType):
    # one Text() attribute per column you want to index, e.g.:
    ROWNAME1 = Text()
    ROWNAME2 = Text()

    class Meta:
        index = 'index_name'


def indexing(row):
    obj = ElementIndex(
        ROWNAME1=str(row['NAME1']),
        ROWNAME2=str(row['NAME2'])
    )
    obj.save(index="index_name")
    return obj.to_dict(include_meta=True)


def bulk_indexing(result):
    # ElementIndex.init(index="index_name")
    ElementIndex.init()
    es = Elasticsearch()
    # here `result` is your iterable of row dicts loaded from the source file
    r = bulk(client=es, actions=(indexing(c) for c in result))
    es.indices.refresh()
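For completeness, a sketch of how result could be produced from the CSV and fed in (the path is just an example, and it assumes the ROWNAME placeholders above have been replaced with your real column names):

import csv

with open('/tmp/x.csv') as f:
    result = list(csv.DictReader(f))  # one dict per row, keyed by the CSV header

bulk_indexing(result)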
So I am collecting data, and this data is saved into CSV files; however, for presentation purposes I want to reorder the columns in each respective CSV file based on its related "order".
I was using this question (write CSV columns out in a different order in Python) as a guide, but I'm not sure why I'm getting the error
writeindices = [name2index[name] for name in writenames]
KeyError: % Processor Time
when I run it. Note this error doesn't seem to be limited to just the string '% Processor Time'.
Where am I going wrong?
Here is my code:
import csv
import operator

CPU_order = ["%" + " Processor Time", "%" + " User Time", "Other"]
Memory_order = ["Available Bytes", "Pages/sec", "Pages Output/sec", "Pages Input/sec", "Page Faults/sec"]

def reorder_csv(path, title, input_file):
    if title == 'CPU':
        order = CPU_order
    elif title == 'Memory':
        order = Memory_order
    output_file = path + '/' + title + '_reorder' + '.csv'
    writenames = order
    reader = csv.reader(input_file)
    writer = csv.writer(open(output_file, 'wb'))
    readnames = reader.next()
    name2index = dict((name, index) for index, name in enumerate(readnames))
    writeindices = [name2index[name] for name in writenames]
    reorderfunc = operator.itemgetter(*writeindices)
    writer.writerow(writenames)
    for row in reader:
        writer.writerow(reorderfunc(row))
Here is a sample of what the input CSV file looks like:
,CPU\% User Time,CPU\% Processor Time,CPU\Other
05/23/2016 06:01:51.552,0,0,0
05/23/2016 06:02:01.567,0.038940741537158409,0.62259056657940626,0.077882481554869071
05/23/2016 06:02:11.566,0.03900149141703179,0.77956981074955856,0
05/23/2016 06:02:21.566,0,0,0
05/23/2016 06:02:31.566,0,1.1695867249963632,0
Your code works. It is your data which does not have a column named "% Processor Time". Here is the sample data I used:
Other,% User Time,% Processor Time
o1,u1,p1
o2,u2,p2
And here is the code which I call:
reorder_csv('.', 'CPU', open('data.csv'))
With these settings, everything works fine. Please check your data.
Update
Now that I see your data, it looks like you have column names such as "CPU\% Processor Time" and want to translate them to "% Processor Time" before writing out. All you need to do is create your name2index this way:
name2index = dict((name.replace('CPU\\', ''), index) for index, name in enumerate(readnames))
The difference here is that instead of name, you have name.replace('CPU\\', ''), which gets rid of the CPU\ part.
Update 2
I reworked your code to use csv.DictReader and csv.DictWriter. I also assume that "CPU\% Privileged Time" will be transformed into "Other". If that is not the case, you can fix it in the transformer dictionary.
import csv
import os


def rename_columns(row):
    """ Take a row (dictionary) of data and return a new row with columns renamed """
    transformer = {
        'CPU\\% User Time': '% User Time',
        'CPU\\% Processor Time': '% Processor Time',
        'CPU\\% Privileged Time': 'Other',
    }
    new_row = {transformer.get(k, k): v for k, v in row.items()}
    return new_row


def reorder_csv(path, title, input_file):
    header = dict(
        CPU=["% Processor Time", "% User Time", "Other"],
        Memory=["Available Bytes", "Pages/sec", "Pages Output/sec", "Pages Input/sec", "Page Faults/sec"],
    )
    reader = csv.DictReader(input_file)
    output_filename = os.path.join(path, '{}_reorder2.csv'.format(title))
    with open(output_filename, 'wb') as outfile:
        # Create a new writer where each row is a dictionary.
        # If the row contains extra keys, ignore them
        writer = csv.DictWriter(outfile, header[title], extrasaction='ignore')
        writer.writeheader()
        for row in reader:
            # Each row is a dictionary, not a list
            print row
            row = rename_columns(row)
            print row
            print
            writer.writerow(row)