# Plain variables: each name refers to an object holding a single value
customer1_age = 38
customer1_high = 178
# A list is an object too, and one list can mix values of different types
customer = [
    38,
    'Divorced',
    1,
    56.3,
    ["", "", ""],
    {},
]
# Inspect the type of the container itself
type(customer)
# -> list
But what is wrong with lists?
a = [1, 2, 3]
b = [4, 5, 6]
# "+" on lists concatenates them instead of adding element by element
print(f"a+b: {a+b}")
# "*" between two lists is even worse: the operation is not defined at all
try:
    print(a * b)
except TypeError:
    print("no-defined operation")
# Output: no-defined operation
They are too slow for computations.
Homework: Describe what try/except statements are, what they are for, and where you can use them.
MUST HAVE: You should know all about lists, tuples, and dictionaries in Python.
new types for data
Solution in Python: a new array object.
import numpy as np
# Two one-dimensional NumPy arrays built from plain Python lists
aa, bb = np.array([1, 2, 3]), np.array([4, 5, 6])
# Check what kind of object np.array returns
type(aa)
Operations for aggregations
print(f"aa+bb: {aa+bb}")
# "+" adds the arrays element by element, as for vectors
try:
    print(aa * bb)
except TypeError:
    print("no-defined operation")
# "*" is defined for arrays: it multiplies element by element, so the
# except branch is not reached here
Create array from list (and generators)
# np.array accepts any sequence, for example a range object
x = np.array(range(4))
x.shape
# Vector and matrix operations: a 2-D array made of two identical rows
A = np.array([range(4)] * 2)
# Transposition: row i becomes column i and column j becomes row j
A.T
# 0-dimensional object (a single value)
scalar = np.array(5)
print(scalar.ndim)
# 1-dimensional object (a vector)
vector_1d = np.array([3, 5, 7])
print(vector_1d.ndim)
# 2-dimensional tabular data (without feature names):
# 2 rows, each with 3 features
matrix_2d = np.array(
    [
        [1, 2, 3],
        [3, 4, 5],
    ]
)
print(matrix_2d.ndim)
# 2-dimensional "picture": a 20 x 20 grid of uniform random values
picture_2d = np.random.uniform(size=(20, 20))
Homework: Add colors to the picture.
Homework: Display the picture in matplotlib.
# Display the 2-D array as an image
from matplotlib import pyplot as plt
plt.imshow(picture_2d, interpolation='nearest')
plt.show()
Sources of Structured data
Data from sklearn datasets:
# from sklearn.datasets import <tab>
from sklearn.datasets import load_iris
# Load the iris dataset that ships with sklearn.datasets
iris = load_iris()
# What is the type of the iris object?
print(type(iris))
# List all keys available on the object
iris.keys()
# Print the dataset description
print(iris.DESCR)
NumPy arrays are nice, but they are not convenient for data scientists.
Solution: Pandas DataFrame
import pandas as pd
import numpy as np
# Build the DataFrame: the feature columns followed by one 'target' column
df = pd.DataFrame(
    np.c_[iris['data'], iris['target']],
    columns=[*iris['feature_names'], 'target'],
)
Very easy operations on table:
# Show the first rows of the table
df.head()
# Show the last 10 rows
df.tail(10)
# Show the non-null count and the dtype of each column
df.info()
# Summary statistics of the columns
df.describe()
Selection (SQL: Select * from tab)
# New feature: a categorical column with the species name for each target code
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
# Remove a feature (column): the numeric target is no longer needed
df = df.drop('target', axis=1)
# First 100 rows only: columns 0 and 2 become the features, column 4 the labels
X = df.iloc[:100, [0, 2]].values
y = df.iloc[:100, 4].values
# Encode the labels as -1 for 'setosa' and 1 for everything else
y = np.where(y == 'setosa', -1, 1)
Easy plot
from matplotlib import pyplot as plt
# Rows 0-49: red circles labelled 'setosa'
plt.scatter(
    X[:50, 0],
    X[:50, 1],
    label='setosa',
    marker='o',
    color='red',
)
# Rows 50-99: blue crosses labelled 'versicolor'
plt.scatter(
    X[50:100, 0],
    X[50:100, 1],
    label='versicolor',
    marker='x',
    color='blue',
)
# Axis titles and legend, then render the figure
plt.xlabel('sepal length (cm)')
plt.ylabel('petal length (cm)')
plt.legend(loc='upper left')
plt.show()
MUST HAVE: filtering, aggregations, and creating NumPy and pandas objects.
from sklearn import datasets
# Synthetic classification data: 10**4 samples with 20 features each
X, y = datasets.make_classification(
    n_samples=10**4,
    n_features=20,
    n_informative=2,
    n_redundant=2,
)
from sklearn.ensemble import RandomForestClassifier
# Split by position: the first train_samples rows are used for training,
# the remaining rows are kept for testing
train_samples = 7000
X_train, X_test = X[:train_samples], X[train_samples:]
y_train, y_test = y[:train_samples], y[train_samples:]
# The model: a random forest with default settings, fitted on the training part
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
Homework: Prepare 100,000 rows of data for any unsupervised learning problem.
As a table in database:
df.head()
from sqlalchemy import create_engine
# SQLite database stored in the local file irysy.db
engine = create_engine('sqlite:///irysy.db')
# if_exists='replace' makes this cell re-runnable: with the default ('fail')
# a second run raises ValueError because the table 'dane' already exists
df.to_sql('dane', con=engine, index=False, if_exists='replace')
Check that everything is all right:
# Engine.execute() was removed in SQLAlchemy 2.0: run the query on an explicit
# connection and wrap the raw SQL string in text()
from sqlalchemy import text
with engine.connect() as connection:
    a = connection.execute(text("SELECT * FROM dane")).fetchall()
# Rebuild a DataFrame from the fetched rows, reusing the original column names
df2 = pd.DataFrame(a, columns=df.columns)
df2.head()
Dump with pickle
import pickle
# Serialize the model object to model.pkl; the with-block closes the file
with open('model.pkl', "wb") as picklefile:
    pickle.dump(rfc, picklefile)
load
# NOTE: only unpickle files you trust - pickle.load can execute arbitrary code
with open('model.pkl', "rb") as picklefile:
    model = pickle.load(picklefile)
Homework: What happens with the Python environment if you want to read your model somewhere else? Homework: Save the model (or another object) with the joblib library.
Let’s start with pictures
from tensorflow.keras.datasets import mnist
# Load the MNIST train/test split and sanity-check the array shapes
(x_train, y_train), (x_test, y_test) = mnist.load_data()
assert x_train.shape == (60000, 28, 28)
assert x_test.shape == (10000, 28, 28)
assert y_train.shape == (60000,)
assert y_test.shape == (10000,)
import matplotlib.pyplot as plt
# Show the first training image. This must be the lower-case x_train loaded
# in this cell; the upper-case X_train is an unrelated feature matrix
plt.imshow(x_train[0], cmap=plt.cm.binary)
Another version, with the fastai library
from fastai.vision.all import *
path = untar_data(URLs.MNIST_SAMPLE)
# Pick the digit 3: sorted listing of the train/3 folder
thress = (path / 'train' / '3').ls().sorted()
im3_path = thress[1]
im3 = Image.open(im3_path)
# Turn the picture into a tensor and keep rows 4-14 and columns 4-21
im3_t = tensor(im3)[4:15, 4:22]
im3_df = pd.DataFrame(im3_t)
# Render the values as a table with a small font and a grey-scale background
(
    im3_df.style
    .set_properties(**{'font-size': '6pt'})
    .background_gradient('Greys')
)
Another example, with color pictures
from tensorflow.keras.datasets import cifar10
# Load the CIFAR-10 train/test split
(x_train, y_train),(x_test, y_test) = cifar10.load_data()
# Sanity-check the array shapes; the trailing 3 on the image arrays is
# presumably the colour-channel axis - TODO confirm
assert x_train.shape == (50000, 32, 32, 3)
assert x_test.shape == (10000, 32, 32, 3)
# Unlike the images, the label arrays carry an explicit trailing axis of size 1
assert y_train.shape == (50000, 1)
assert y_test.shape == (10000, 1)
With PyTorch library
import torch
# Element-wise addition of two 1-D tensors
a = torch.tensor([1, 2, 3])
b = torch.tensor([3, 4, 5])
c = a + b
print(c)
# A 2-D tensor: inspect its size and its transpose
d = torch.tensor(
    [
        [1, 2, 3],
        [3, 4, 5],
    ]
)
d.size()
d.T
# Use the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
x = torch.tensor([[1, 2, 3], [4, 5, 6]], device=device)
x.device
JSON objects
import json
# A JSON document held in a Python string
person = '{"name": "Alice", "languages": ["English", "French"]}'
# Parse the JSON text into a Python dictionary and show it
person_dict = json.loads(person)
print(person_dict)
Easy to save data (or object ?)
%%file test.json
{"name": "Alice", "languages": ["English", "French"]}
LOAD from file
# Read the JSON file back into a Python object; the with-block closes the file
with open('test.json') as f:
    data = json.load(f)
print(data)
SAVE to file
# Write the dictionary to person.json; the with-block closes the file
with open('person.json', 'w') as json_file:
    json.dump(person_dict, json_file)
# The "+" operator and the explicit __add__ method call, side by side
4+4
"napis" + " inny napis"
type("napis")
"napis".__add__("inny napis")
# In an interactive session, type `"napis".` and press <Tab> to list the
# available attributes (a bare trailing dot is a syntax error in a script)
"napis".__dir__()
import numpy as np
np.array([1,2,3]).__dir__()
def moja_funkcja():
    """Do nothing; a placeholder showing the minimal form of a function."""
    pass
class Nazwa(object):
    """Empty class; a placeholder showing the minimal form of a class."""
    pass
# Two separate instances of the same class
a, b = Nazwa(), Nazwa()
# Five more instances, collected in a list
[Nazwa() for _ in range(5)]
# Attributes available on an instance
b.__dir__()