How to convert your script into a class? - Python

I wrote some code to convert text from a PDF file into a pandas DataFrame. The code works well on its own, but when I try to fit it into a class and define a method for it, it returns an error.
import pdfplumber
import pandas as pd
import re
cols = ["Declaration Number", "Declaration Date", "Warehouse", "Quantity", "Number of boxes", "Product name", "Invoice Number"]
dataset = []
quant = []
date = []
decl_date = []
decl = re.compile(r'\d{8}AN\d{6}')
decd = re.compile(r'\d{2}\.\d{2}\.\d{4}')
whse = re.compile(r'ANTREPO | LİMAN')
qty = re.compile(r'\d.KAP')
prod = re.compile(r'Ticari')
invNo = re.compile(r'Fatura')
class pdf():
    def __init__(self):
        self.kap = None
        self.kg = None

    def FirstPage():
        with pdfplumber.open("44550500AN087999.pdf") as pdf:
            page = pdf.pages[0]
            text = page.extract_text()
            for line in text.split('\n'):
                if decl.search(line):
                    decl_num = line.split()[-1]
                if decd.search(line):
                    decl_date = []
                    date = []
                    decl_date.append(line.split())
                    date = decl_date[1][-1]
                if whse.search(line):
                    warehouse = line.split()
                if qty.search(line):
                    quant = line.split()
                    kap = quant[0] + " " + quant[1]
                    kg = quant[2] + " " + quant[3]
When I run it, it returns several errors.
For instance:
<ipython-input-26-bc082b4afef0> in FirstPage()
20 date = []
21 decl_date.append(line.split())
---> 22 date = decl_date[1][-1]
23 if whse.search(line):
24 warehouse = line.split()
IndexError: list index out of range
I am probably defining the variables wrong, but I am a newbie, so does anyone have any idea what I am doing wrong?

You are only putting one element into decl_date, and then trying to access the second element of that list, which does not exist.
Your use of line.split() also seems incorrect to me. The way you have used it essentially just puts the string into a one-element list: "string" -> ["string"].
I assume you want to split the string using the regex in each if-statement; in that case, change line.split() to pattern.split(line)[index], swapping in the appropriate pattern and index.
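For illustration, here is a minimal sketch of FirstPage rewritten as an instance method; it sidesteps the failing index by reading values straight off the regex match object. The path default and the self attributes are assumptions on my part, not from the original code:
import pdfplumber

class Pdf:
    def __init__(self):
        self.kap = None
        self.kg = None

    def first_page(self, path="44550500AN087999.pdf"):
        # decl, decd and qty are the module-level regexes from the question
        with pdfplumber.open(path) as pdf_file:
            text = pdf_file.pages[0].extract_text()
        for line in text.split('\n'):
            if decl.search(line):
                self.decl_num = line.split()[-1]
            date_match = decd.search(line)
            if date_match:
                # read the date straight from the match instead of
                # indexing into a one-element list
                self.date = date_match.group(0)
            if qty.search(line):
                quant = line.split()
                self.kap = quant[0] + " " + quant[1]
                self.kg = quant[2] + " " + quant[3]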

Related

How to get the mean of an int variable within 6 months, based on a date variable

I would like to know how I could get the mean of notes by grouping dates into 6-month periods. In other words, say I want the mean of notes for all the comments between 01/01/2020 and 30/06/2020, and likewise between 01/07/2020 and 31/12/2020.
You get the idea :)
I would also like to know the number of comments within each 6-month period, but I suppose that is much the same process.
Here are some rows of my database:
Here's how I obtained it with web scraping:
import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import time
import random
root_url = 'https://fr.trustpilot.com/review/www.gammvert.fr'
urls = [ '{root}?page={i}'.format(root=root_url, i=i) for i in range(1,807) ]
comms = []
notes = []
dates = []
for url in urls:
    results = requests.get(url)
    time.sleep(20)
    soup = BeautifulSoup(results.text, "html.parser")
    commentary = soup.find_all('section', class_='review__content')
    for container in commentary:
        try:
            comm = container.find('p', class_ = 'review-content__text').text.strip()
        except:
            comm = container.find('a', class_ = 'link link--large link--dark').text.strip()
        comms.append(comm)
        note = container.find('div', class_ = 'star-rating star-rating--medium').find('img')['alt']
        notes.append(note)
        date_tag = container.div.div.find("div", class_="review-content-header__dates")
        date = json.loads(re.search(r"({.*})", str(date_tag)).group(1))["publishedDate"]
        dates.append(date)
data = pd.DataFrame({
    'comms' : comms,
    'notes' : notes,
    'dates' : dates
})
data['comms'] = data['comms'].str.replace('\n', '')
data['dates'] = pd.to_datetime(data['dates']).dt.date
data['dates'] = pd.to_datetime(data['dates'])
#print(data.head())
data.to_csv('file.csv', sep=';', index=False)
Here’s the function I used to obtain my comms_clean and month columns:
def clean_text(text):
    text = tokenizer.tokenize(text)
    text = nltk.pos_tag(text)
    text = [word for word, pos in text
            if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
    text = [word for word in text if not word in stop_words]
    text = [word for word in text if len(word) > 2]
    final_text = ' '.join([w for w in text if len(w) > 2])  # remove words with one letter
    return final_text
data['comms_clean'] = data['comms'].apply(lambda x : clean_text(x))
data['month'] = data.dates.dt.strftime('%Y-%m')
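As an aside, the snippet above assumes tokenizer and stop_words were defined earlier. A plausible setup (an assumption on my part, not shown in the question) would be:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# may require nltk.download('stopwords') and
# nltk.download('averaged_perceptron_tagger') first
tokenizer = RegexpTokenizer(r'\w+')
# French stop words, since the reviews come from fr.trustpilot.com
stop_words = set(stopwords.words('french'))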
I suppose we can get that with the .dt accessor in pandas, but I haven't found out how. Do you have any idea or pointer on how to get that?
Thank you :)
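One hedged sketch of an approach, not from the original thread: group the rows into 6-month bins with pd.Grouper. This assumes the notes column has already been converted to numeric values:
import pandas as pd

data = pd.DataFrame({
    'dates': pd.to_datetime(['2020-01-15', '2020-03-02', '2020-08-20']),
    'notes': [4, 5, 2],
})
# '6MS' = bins of 6 months, each starting on the first of the month
halves = data.groupby(pd.Grouper(key='dates', freq='6MS'))['notes']
# mean of notes and number of comments per half-year
print(halves.agg(['mean', 'count']))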

IndexError: list index out of range with Regular expression

I am trying to scrape data from this link
https://www.seloger.com/
and I get this error. I don't understand what's wrong, because I have already run this code before and it worked.
import re
import requests
import csv
import json
with open("selog.csv", "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["id", "Type", "Prix", "Code_postal", "Ville", "Departement", "Nombre_pieces", "Nbr_chambres", "Type_cuisine", "Surface"])
for i in range(1, 500):
url = str('https://www.seloger.com/list.htm?tri=initial&idtypebien=1,2&pxMax=3000000&div=2238&idtt=2,5&naturebien=1,2,4&LISTING-LISTpg=' + str(i))
r = requests.get(url, headers = {'User-Agent' : 'Mozilla/5.0'})
p = re.compile('var ava_data =(.*);\r\n\s+ava_data\.logged = logged;', re.DOTALL)
x = p.findall(r.text)[0].strip().replace('\r\n ','').replace('\xa0',' ').replace('\\','\\\\')
x = re.sub(r'\s{2,}|\\r\\n', '', x)
data = json.loads(x)
f = csv.writer(open("Seloger.csv", "wb+"))
for product in data['products']:
ID = product['idannonce']
prix = product['prix']
surface = product['surface']
code_postal = product['codepostal']
nombre_pieces = product['nb_pieces']
nbr_chambres = product['nb_chambres']
Type = product['typedebien']
type_cuisine = product['idtypecuisine']
ville = product['ville']
departement = product['departement']
etage = product['etage']
writer.writerow([ID, Type, prix, code_postal, ville, departement, nombre_pieces, nbr_chambres, type_cuisine, surface])
This is the error:
Traceback (most recent call last):
File "Seloger.py", line 20, in <module>
x = p.findall(r.text)[0].strip().replace('\r\n ','').replace('\xa0',' ').replace('\\','\\\\')
IndexError: list index out of range
This line is wrong:
x = p.findall(r.text)[0].strip().replace('\r\n ','').replace('\xa0',' ').replace('\\','\\\\')
What do you need to find in the text? To keep working on the scraped text itself, change the line above to:
x = r.text.strip().replace('\r\n ','').replace('\xa0',' ').replace('\\','\\\\')
and then search it for whatever you need.
The error occurs because sometimes there is no match, and you are trying to access a non-existent item in an empty list. The same result can be reproduced with print(re.findall("s", "d")[0]).
To fix the issue, replace x = p.findall(r.text)[0].strip().replace('\r\n ','').replace('\xa0',' ').replace('\\','\\\\') line with
x = ''
xm = p.search(r.text)
if xm:
    x = xm.group(1).strip().replace('\r\n ','').replace('\xa0',' ').replace('\\','\\\\')
NOTES
When you use p.findall(r.text)[0], you want the first match in the input, so re.search is the best fit here, as it only returns the first match.
To obtain the substring captured in the first capturing group, you need to use matchObject.group(1).
The if xm: check is important: if there is no match, x remains an empty string; otherwise, it is assigned the modified value of Group 1.
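To illustrate the difference on a no-match input (a small sketch with a made-up string):
import re

p = re.compile(r'var ava_data =(.*);', re.DOTALL)
print(p.findall("no data here"))   # -> [] ; indexing [0] raises IndexError
m = p.search("no data here")       # -> None ; can be tested before use
if m:
    print(m.group(1))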

Trying to create a matrix list of excel rows as dictionary keys

I am trying to create a dictionary of matrices (double arrays/lists) in Python, and this is what I have so far. I have already created the dictionary and assigned it the proper keys. I have an Excel file that I was able to parse successfully, but the first field of a row sometimes contains a UPN number duplicated in other rows (car parts that can fit multiple cars). I want to scan this portion of the row, compare it to the row after it, and if they match, assign both rows to a matrix and make that matrix the value of the key (the UPN, i.e. the part that may fit multiple cars). But I can't seem to crack it. I am somewhat new to Python, and this is a low-pay gig, but still: what is going on?
import csv
import collections
import sys
from collections import defaultdict
from Car_of_Interest import Car_of_Interest
import json
import numbers
import enum
import pickle
import shelve
#import dill as pickle
carDataFrame = {}
def main():
    data = csv.open(r"C:\Users\Ultrarev\Desktop\Duplicator-fier\Book1.csv")
    data = csv.reader(data)
    print("Data: (testing)")

def f7(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

with open(r"C:\Users\Ultrarev\Desktop\Emeran-Parser\Book1.csv", newline = '') as csvfile:
    data = csv.reader(csvfile)
    rowExcel = []
    kPN = []
    car_info = []
    UPN = ()
    cUPN = []
    CARS = {}
    #UPN_len = len(UPN)
    for row in data:
        open(r"C:\Users\Ultrarev\Desktop\Emeran-Parser\Book1.csv")
        rowExcel.append(row)
        car_info.append(row[2])
        car_info.append(row[3])
        car_info.append(row[4])
        #print('-> '.join(row))
    for row in rowExcel:
        kPN.append(row[0])
    # Convert to string.
    list1 = kPN
    str1 = ' '.join(str(e) for e in list1)
    #Remove Duplicates in kPN.
    UPN = f7(kPN)
    l = UPN.__sizeof__()
    print("kPN: ")
    print(kPN)
    print("Data: ")
    print(data)
    print("UPN: ")
    print(UPN)
    print("Major Car Information of File to be Parsed into HTML: ")
    print(rowExcel)
    print("Number of UPN's: ")
    print(len(UPN))
    print(len(kPN))
    #for n in UPN:
    data = open(r"C:\Users\Ultrarev\Desktop\Emeran-Parser\Book1.csv")
    #create the foundation of the dictionary.
    mydict = dict()
    SANAD = []
    for n in range(len(UPN)):
        if (rowExcel[n][0] == rowExcel[n+1][0]):
            while (rowExcel[n][0] == rowExcel[n+1][0]):
                SANAD.append(rowExcel[n])
            mydict.update({UPN[n]: SANAD}) #'tag %s' % n
    print(mydict)
I am expecting mydict to print as a dictionary with the UPNs as keys and, as values, a matrix of the rows sharing that UPN. I am also suspecting that my Python knowledge is not up to par for this project.
EDIT:
I created two other classes, one called Car_of_Interest.py, which would be a kind of factory class that produces Car_I objects storing each car's information.
import pickle
#import dill

class Car_I:
    def init(self, year, make, model, car_I):
        data.self = data
        x.self = year
        y.self = make
        z.self = model
        data = {"year": year,
                "mode": model,
                "make": make
                }

    def __add__(self, other):
        return 1

    def printData(self):
        print(x, y, z)

    def honk_the_Horn(self):
        print("Honk! Honk!")

    def PRINT_Dic(self):
        L = {}
        L = {UPN: (x, y, z)}
        print(L)

    def make_INTERNAL_LABEL(self, label):
        lUPN.self = label
        return("lUPN")
Would storing this information as a matrix, OR going the Car_of_Interest -> Car_I(n...) route, be a better approach?
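One hedged sketch of the grouping itself, not from the original thread: collect CSV rows by the UPN in their first field with a defaultdict, so rows with duplicate UPNs end up in one list. The file path is the one from the question:
import csv
from collections import defaultdict

by_upn = defaultdict(list)
with open(r"C:\Users\Ultrarev\Desktop\Emeran-Parser\Book1.csv", newline='') as csvfile:
    for row in csv.reader(csvfile):
        by_upn[row[0]].append(row)   # rows sharing a UPN collect together

# each key is a UPN; each value is the "matrix" of rows with that UPN
print(dict(by_upn))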

Can't figure out how to properly output my data

I'm a relative novice at Python, but I somehow managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my CSV:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron',
2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'm looking to end up with is 5 columns at the end of my .CSV, each outputting the X-th most commonly used value.
So something along the lines of this:
I've Googled for a while and managed to isolate them separately, but I always end up with '('#thekidfromakron', 2)' as an output. I seem to be missing some part of the puzzle :(.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter
ts = time.gmtime()
def get_csv_header(top_numb):
    fieldnames = ['USER','MEDIA COUNT','FOLLOWERCOUNT','TOTAL LIKES','TOTAL COMMENTS','ER','ER IN %', 'BIO', 'ALL CAPTION TEXT','HASHTAGS COUNTED','MOST COMMON HASHTAGS']
    return fieldnames

def write_csv_header(filename, headers):
    with open(filename, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=headers)
        writer.writeheader()
    return

def read_user_name(t_file):
    with open(t_file) as f:
        user_list = f.read().splitlines()
    return user_list

if __name__ == '__main__':
    # HERE YOU CAN SPECIFY YOUR USERLIST FILE NAME,
    # Which contains a list of usernames's BY DEFAULT <current working directory>/userlist.txt
    USER_FILE = 'userlist.txt'
    # HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv)', Where your final result stays
    DATA_FILE = 'users_with_er.csv'
    MAX_POST = 12  # MAX POST
    print('Starting the engagement calculations... Please wait until it finishes!')
    users = read_user_name(USER_FILE)
    """ Writing data to csv file """
    csv_headers = get_csv_header(MAX_POST)
    write_csv_header(DATA_FILE, csv_headers)
    for user in users:
        post_info = {'USER': user}
        url = 'https://www.instagram.com/' + user + '/'
        #for troubleshooting, un-comment the next two lines:
        #print(user)
        #print(url)
        try:
            r = requests.get(url)
            if r.status_code != 200:
                print(timestamp,' user {0} not found or page unavailable! Skipping...'.format(user))
                continue
            soup = BeautifulSoup(r.content, "html.parser")
            scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
            stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
            j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
            timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
        except ValueError:
            print(timestamp,'ValueError for username {0}...Skipping...'.format(user))
            continue
        except IndexError as error:
            # Output expected IndexErrors.
            print(timestamp, error)
            continue
        if j['graphql']['user']['edge_followed_by']['count'] <= 0:
            print(timestamp,'user {0} has no followers! Skipping...'.format(user))
            continue
        if j['graphql']['user']['edge_owner_to_timeline_media']['count'] < 12:
            print(timestamp,'user {0} has less than 12 posts! Skipping...'.format(user))
            continue
        if j['graphql']['user']['is_private'] is True:
            print(timestamp,'user {0} has a private profile! Skipping...'.format(user))
            continue
        media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
        accountname = j['graphql']['user']['username']
        followercount = j['graphql']['user']['edge_followed_by']['count']
        bio = j['graphql']['user']['biography']
        i = 0
        total_likes = 0
        total_comments = 0
        all_captiontext = ''
        while i <= 11:
            total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
            total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
            captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
            caption_detail = captions['edges'][0]['node']['text']
            all_captiontext += caption_detail
            i += 1
        engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount)/12)*100) + '%'
        engagement_rate = (((total_likes + total_comments) / followercount)/12*100)
        #isolate and count hashtags
        hashtags = re.findall(r'#\w*', all_captiontext)
        hashtags_counted = Counter(hashtags)
        most_common = hashtags_counted.most_common(5)
        with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:
            print(timestamp,'Writing Data for user {0}...'.format(user))
            post_info["USER"] = accountname
            post_info["FOLLOWERCOUNT"] = followercount
            post_info["MEDIA COUNT"] = media_count
            post_info["TOTAL LIKES"] = total_likes
            post_info["TOTAL COMMENTS"] = total_comments
            post_info["ER"] = engagement_rate
            post_info["ER IN %"] = engagement_rate_percentage
            post_info["BIO"] = bio
            post_info["ALL CAPTION TEXT"] = all_captiontext
            post_info["HASHTAGS COUNTED"] = hashtags_counted
            csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
            csv_writer.writerow(post_info)
    """ Done with the script """
    print('ALL DONE !!!! ')
The code before this simply scrapes the webpage and compiles all the captions from the last 12 posts into all_captiontext.
Any help solving this (probably simple) issue would be greatly appreciated, as I've been struggling with it for days (again, I'm a noob :') ).
Replace line
post_info["MOST COMMON HASHTAGS"] = most_common
with:
for i, counter_tuple in enumerate(most_common):
    tag_name = counter_tuple[0].replace('#','')
    label = "Top %d" % (i + 1)
    post_info[label] = tag_name
There's also a bit of code missing. For example, your code doesn't include a csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
It also seems that you're opening the file to write just one row. I don't think that's intended, so what you probably want is to collect the results into a list of dictionaries. A cleaner solution would be to use a pandas DataFrame, which you can output straight to a CSV file.
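As a hedged sketch of that suggestion, with made-up values standing in for the per-user results:
import pandas as pd

rows = []  # collect one dict per user instead of writing inside the loop
rows.append({'USER': 'example_user', 'FOLLOWERCOUNT': 123, 'Top 1': 'jamesgang'})
rows.append({'USER': 'other_user', 'FOLLOWERCOUNT': 456, 'Top 1': 'striveforgreatness'})

# one write at the end; pandas derives the columns from the dict keys
pd.DataFrame(rows).to_csv('users_with_er.csv', index=False)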
most_common being the output of the call to hashtags_counted.most_common, I had a look at the doc here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
The output is formatted as follows: [(key, value), (key, value), ...], ordered by decreasing number of occurrences.
Hence, to get only the name and not the number of occurrences, you should replace:
post_info["MOST COMMON HASHTAGS"] = most_common
by
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuples. This statement builds, on the fly, the list of the first element of each tuple, keeping the sort order.
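For example, using the values from the output shown in the question:
most_common = [('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2)]
print([x[0] for x in most_common])
# -> ['#striveforgreatness', '#jamesgang', '#thekidfromakron']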

Python: splitting parsed items into a CSV file

I got advice from Jamie Bull and PM 2Ring to use the CSV module for the output of my web scraper. I'm nearly done, but I have an issue with some parsed items that are separated by a colon or hyphen. I want those items split into two items in the current list.
Current output:
GB,16,19,255,1,26:40,19,13,4,2,6-12,0-1,255,57,4.5,80,21,3.8,175,23-33,4.9,3,14,1,4,38.3,8,65,1,0
Sea,36,25,398,1,33:20,25,8,13,4,4-11,1-1,398,66,6.0,207,37,5.6,191,19-28,6.6,1,0,0,2,33.0,4,69,2,1
Desired output (the differences are the colon- and hyphen-separated items, now split into separate fields):
GB,16,19,255,1,26,40,19,13,4,2,6,12,0,1,255,57,4.5,80,21,3.8,175,23,33,4.9,3,14,1,4,38.3,8,65,1,0
Sea,36,25,398,1,33,20,25,8,13,4,4,11,1,1,398,66,6,207,37,5.6,191,19,28,6.6,1,0,0,2,33,4,69,2,1
I am unsure where or how to make these changes. I also don't know if regex is needed. Obviously I could handle this in Notepad or Excel, but my goal is to handle all of this in Python.
If you run the program, the above results are from the 2014 season, week 1.
import requests
import re
from bs4 import BeautifulSoup
import csv
year_entry = raw_input("Enter year: ")
week_entry = raw_input("Enter week number: ")
week_link = requests.get("http://sports.yahoo.com/nfl/scoreboard/?week=" + week_entry + "&phase=2&season=" + year_entry)
page_content = BeautifulSoup(week_link.content)
a_links = page_content.find_all('tr', {'class': 'game link'})
csvfile = open('NFL_2014.csv', 'a')
writer = csv.writer(csvfile)
for link in a_links:
    r = 'http://www.sports.yahoo.com' + str(link.attrs['data-url'])
    r_get = requests.get(r)
    soup = BeautifulSoup(r_get.content)
    stats = soup.find_all("td", {'class':'stat-value'})
    teams = soup.find_all("th", {'class':'stat-value'})
    scores = soup.find_all('dd', {"class": 'score'})
    try:
        away_game_stats = []
        home_game_stats = []
        statistic = []
        game_score = scores[-1]
        game_score = game_score.text
        x = game_score.split(" ")
        away_score = x[1]
        home_score = x[4]
        home_team = teams[1]
        away_team = teams[0]
        away_team_stats = stats[0::2]
        home_team_stats = stats[1::2]
        away_game_stats.append(away_team.text)
        away_game_stats.append(away_score)
        home_game_stats.append(home_team.text)
        home_game_stats.append(home_score)
        for stats in away_team_stats:
            text = stats.text.strip("").encode('utf-8')
            away_game_stats.append(text)
        writer.writerow(away_game_stats)
        for stats in home_team_stats:
            text = stats.text.strip("").encode('utf-8')
            home_game_stats.append(text)
        writer.writerow(home_game_stats)
    except:
        pass
csvfile.close()
Any help is greatly appreciated. This is my first program and searching this board has been a great resource.
Thanks,
JT
You can use regular expressions to split the strings and then "flatten" the resulting list of lists, so the sublists aren't written out as quoted groups, like this:
Substitute
writer.writerow(away_game_stats)
with
away_game_stats = [re.split(r"-|:",x) for x in away_game_stats]
writer.writerow([x for y in away_game_stats for x in y])
(and the same for writer.writerow(home_game_stats))
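To illustrate the split-and-flatten on a short made-up row:
import re

row = ['GB', '26:40', '6-12', '4.5']
split_row = [re.split(r"-|:", x) for x in row]  # [['GB'], ['26', '40'], ['6', '12'], ['4.5']]
flat = [x for y in split_row for x in y]        # ['GB', '26', '40', '6', '12', '4.5']
print(flat)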
Alternatively, you can substitute the delimiters with commas directly:
import re
print re.sub(r"-|:",",",test_string)
See the demo: https://regex101.com/r/aQ3zJ3/2
