Real-Time Analytics Lectures for SGH students

Laboratory 1 - Different types of data in Python

Structured and unstructured data in Python

python types for data

# a plain variable is a name bound to a single object (here: an int)
customer1_age = 38
customer1_high = 178  # NOTE(review): "high" presumably means height - confirm
# a list is one object that holds many other objects
customer = [38, 'Divorced', 1, 56.3, ["","",""], {}]
# the elements have different types: int, str, float, list and dict
type(customer)
# list

But what is wrong with lists?

a, b = [1, 2, 3], [4, 5, 6]

# `+` concatenates two lists instead of adding them element by element
print(f"a+b: {a+b}")
# `*` between two lists is even worse: it is not defined and raises TypeError
try:
    product = a * b
except TypeError:
    print("no-defined operation")
else:
    print(product)
# no-defined operation

Lists do not support element-wise arithmetic, and they are too slow for numerical computations.

Homework. Describe what try - except expressions are, what they are for, and where you can use them.

MUST HAVE: You should know everything about lists, tuples and dictionaries in Python.

new types for data

Solution in Python: a new array object (the NumPy array).

import numpy as np

# np.array builds a numpy array object from a Python list
aa = np.array([1,2,3])
bb = np.array([4,5,6])

# the result is a numpy.ndarray, not a list
type(aa)

Operations for aggregations

# `+` on numpy arrays adds element by element, as for vectors
print(f"aa+bb: {aa+bb}")
# `*` is defined for arrays as well, so the TypeError branch is not taken here
try:
    product = aa * bb
except TypeError:
    print("no-defined operation")
else:
    print(product)
# correct multiplication element by element

Create array from list (and generators)

# any iterable works as input, e.g. a range object -> 1-D array [0, 1, 2, 3]
x = np.array(range(4))
# shape is a tuple with the size of every dimension, here (4,)
x.shape

# vector and matrix operations

# two identical rows -> a 2 x 4 matrix
A = np.array([range(4),range(4)])
# transposition: rows become columns and columns become rows (2 x 4 -> 4 x 2)
A.T

# 0-dim object (a single number wrapped in an array)
scalar = np.array(5)
print(scalar.ndim)
# 1-dim object
vector_1d = np.array([3, 5, 7])
print(vector_1d.ndim)
# 2-dim tabular data (without feature names)
# 2 rows (observations) with 3 features each
matrix_2d = np.array([[1,2,3],[3,4,5]])
print(matrix_2d.ndim)
# 2-dim picture, 20 x 20 pixels, with random values drawn uniformly from [0, 1)
picture_2d = np.random.uniform(size=(20,20))

Homework: Add colors for picture

Homework: display picture in matplotlib

from matplotlib import pyplot as plt
# draw the 2-D array as an image; 'nearest' shows every pixel as a sharp square
plt.imshow(picture_2d, interpolation='nearest')
plt.show()

Sources of Structured data

Data from sklearn datasets:

# from sklearn.datasets import <tab>   (tab completion lists the available data sets)
from sklearn.datasets import load_iris
iris = load_iris()
# what is the type of the iris object?
print(type(iris))
# find all keys
iris.keys()
# print the text description stored with the data set
print(iris.DESCR)

NumPy arrays are nice, but they are not convenient enough for data scientists.

Solution: Pandas DataFrame

import pandas as pd
import numpy as np

# create DataFrame: np.c_ glues the feature matrix and the target vector
# together column-wise, so 'target' becomes the last column
df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                  columns= iris['feature_names'] + ['target'])

Very easy operations on table:

# show the first rows (5 by default)
df.head()
# show the last 10 rows
df.tail(10)
# show info about NaN values and a type of each column.
df.info()
# summary statistics of the numeric columns
df.describe()

Selection (SQL: Select * from tab)

# new feature: species name built from the integer class codes
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
# remove features (columns)
df = df.drop(columns=['target'])
# first 100 rows, columns 0 and 2, as the feature matrix X
X = df.iloc[:100,[0,2]].values
# first 100 rows, column 4 (expected to be 'species' after the drop above), as labels
y = df.iloc[0:100,4].values
# encode the labels as numbers: 'setosa' -> -1, everything else -> 1
y = np.where(y == 'setosa',-1,1)

Easy plot

from matplotlib import pyplot as plt 
# rows 0-49 are drawn as 'setosa' (red circles), rows 50-99 as 'versicolor' (blue crosses)
plt.scatter(X[:50,0],X[:50,1],color='red', marker='o',label='setosa')
plt.scatter(X[50:100,0],X[50:100,1],color='blue', marker='x',label='versicolor')
# axis names follow the two columns selected into X
plt.xlabel('sepal length (cm)')
plt.ylabel('petal length (cm)')
plt.legend(loc='upper left')
plt.show()

MUST HAVE: filtering, aggs, creating numpy and pandas objects

Fast data generators

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

# synthetic classification problem: 10 000 rows, 20 features,
# of which 2 are informative and 2 are redundant
X, y = datasets.make_classification(
    n_samples=10**4,
    n_features=20,
    n_informative=2,
    n_redundant=2,
)

# the first 7000 rows train the model, the remaining rows are kept for testing
train_samples = 7000
X_train, X_test = X[:train_samples], X[train_samples:]
y_train, y_test = y[:train_samples], y[train_samples:]

# model: a random forest with default settings
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

Homework. Prepare 100 000 rows of data for any unsupervised problem.

How can we save our data?

As a table in database:

df.head()

from sqlalchemy import create_engine
# SQLite database kept in the local file irysy.db
engine = create_engine('sqlite:///irysy.db')
# write the DataFrame as table 'dane' (without the index column).
# if_exists='replace' makes the cell safe to re-run: with the default
# ('fail') to_sql raises ValueError once the table already exists.
df.to_sql('dane', con=engine, index=False, if_exists='replace')

Check that everything is all right:

# Engine.execute() was deprecated in SQLAlchemy 1.4 and removed in 2.0:
# run the query on an explicit connection and wrap the raw SQL in text()
from sqlalchemy import text

with engine.connect() as connection:
    a = connection.execute(text("SELECT * FROM dane")).fetchall()
# rebuild a DataFrame from the fetched rows, reusing the column names of df
df2 = pd.DataFrame(a, columns=df.columns)
df2.head()

But how do you save your model?

Dump with pickle

import pickle
# pickle produces bytes, so the file is opened in binary write mode ("wb")
with open('model.pkl', "wb") as picklefile:
    pickle.dump(rfc, picklefile)

load

# read the bytes back ("rb") and rebuild the saved object
# NOTE: only unpickle files you trust - loading a pickle can execute arbitrary code
with open('model.pkl',"rb") as picklefile:
    model = pickle.load(picklefile)

Homework. What about the Python environment if you want to read your model in another place? Homework. Save the model (or another object) with the joblib library.

Unstructured data

Let’s start with pictures

from tensorflow.keras.datasets import mnist

# load_data returns two (images, labels) pairs: the train part and the test part
(x_train, y_train),(x_test, y_test) = mnist.load_data()

# the asserts document the expected shapes: 60000 train and 10000 test
# pictures of 28 x 28 pixels, with one label per picture
assert x_train.shape == (60000, 28, 28)
assert x_test.shape == (10000, 28, 28)
assert y_train.shape == (60000,)
assert y_test.shape == (10000,)


import matplotlib.pyplot as plt
# show the first MNIST training picture. The pictures live in lower-case
# x_train; upper-case X_train is the 2-D feature matrix of the random forest
# example, whose rows are 1-D and cannot be drawn as an image.
plt.imshow(x_train[0], cmap=plt.cm.binary)

Other Version with fastai library

from fastai.vision.all import *

# presumably downloads and unpacks the sample data set and returns its local path - confirm
path = untar_data(URLs.MNIST_SAMPLE)
# sorted list of the files in train/3 (presumably the pictures of the digit 3 - confirm)
thress = (path/'train'/'3').ls().sorted()
# take the second file from the list
im3_path = thress[1]
im3 = Image.open(im3_path)
# save picture as a tensor and slice part of data (rows 4-14, columns 4-21)
im3_t = tensor(im3)[4:15,4:22]

# show the slice as a table with a small font and a grey background gradient
im3_df = pd.DataFrame(im3_t)
im3_df.style.set_properties(**{'font-size':'6pt'})\
.background_gradient('Greys')

Other example with colored pictures

from tensorflow.keras.datasets import cifar10

# NOTE: this rebinds x_train, y_train, x_test and y_test from the MNIST example
(x_train, y_train),(x_test, y_test) = cifar10.load_data()
# colour pictures carry a 4th axis of size 3 (presumably the RGB channels - confirm)
assert x_train.shape == (50000, 32, 32, 3)
assert x_test.shape == (10000, 32, 32, 3)
# labels come as a column: one row with a single value per picture
assert y_train.shape == (50000, 1)
assert y_test.shape == (10000, 1)

With PyTorch library

import torch

# two 1-D tensors; adding them works element by element
a = torch.tensor([1, 2, 3])
b = torch.tensor([3, 4, 5])
c = torch.add(a, b)
print(c)

# 2 x 3 tensor built from nested lists
d = torch.tensor([[1, 2, 3], [3, 4, 5]])
d.shape

# transposition swaps rows and columns
d.T

# use the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# create the tensor directly on the chosen device
x = torch.tensor([[1, 2, 3], [4, 5, 6]], device=device)
x.device

JSON objects

import json
# a JSON document kept in a Python string
person = '{"name": "Alice", "languages": ["English", "French"]}'
# json.loads parses JSON text into Python objects, here a dict
person_dict = json.loads(person)

print( person_dict)

Easy to save data (or object ?)

%%file test.json
{"name": "Alice", "languages": ["English", "French"]}

LOAD from file

# json.load (without the "s") parses JSON from an open file instead of a string
with open('test.json') as f:
    data = json.load(f)

print(data)

SAVE to file

# json.dump writes the object as JSON text into the file opened for writing
with open('person.json', 'w') as json_file:
    json.dump(person_dict, json_file)

Python types as objects

# even the simplest expressions work on objects
4+4
"napis" + " inny napis"
type("napis")
# the `+` operator on strings is the same as calling the __add__ method
"napis".__add__("inny napis")

# "napis".  <press tab>
# (in the notebook, tab completion after the dot lists the available methods;
#  kept as a comment because a bare `"napis".` is a SyntaxError when executed)
# __dir__ returns the list of attribute and method names
"napis".__dir__()

import numpy as np
np.array([1,2,3]).__dir__()

CLASS 0 - Empty Class

def moja_funkcja():
    """Empty function: does nothing and returns None."""


class Nazwa:
    """Empty class: defines no attributes or methods of its own."""


# every call of the class creates a new, separate instance
a, b = Nazwa(), Nazwa()

# five more instances built in one go
[Nazwa() for _ in range(5)]

# list the attribute names available on an instance of the empty class
b.__dir__()