# -- PTML helpful tools --
## Pandas
### Imports:
```python=3.9
import pandas as pd
```
### Functions:
```python=3.9
# shape of a dataframe (rows, columns):
df1.shape
# read_csv:
df1 = pd.read_csv('db.csv', sep=',', index_col=0, dtype={'col': str})
# split:
sub_df1 = df1['col2parse'].str.split(',', expand=True)  # expand=True spreads the split into separate columns.
# rename:
df1.rename(columns={'col1':'Col 1'}, inplace=True)
# astype:
df1['col'] = df1['col'].astype(float)
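# if the cast can fail, pd.to_numeric with errors='coerce' turns bad values into NaN instead of raising:
df1['col'] = pd.to_numeric(df1['col'], errors='coerce')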
# join (merge is the column-based equivalent; join matches on the other frame's index):
df1.join(df2, on='pivot_column', how='inner')  # and if you ever hit a uint64/object dtype clash:
df1.join(df2.set_index('pivot_column'), how='inner')
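# the explicit merge equivalent, matching on columns in both frames:
pd.merge(df1, df2, on='pivot_column', how='inner')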
# drop:
df1.drop(columns=['col2drop'], inplace=True)
# dropna:
df1.dropna(inplace=True)
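# fillna is the alternative when you would rather keep the rows (0 is an arbitrary placeholder):
df1 = df1.fillna(0)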
# Group By operations:
df1.groupby('col').mean()
df1.groupby('col')[['other_col']].mean()  # if you want the mean of one specific column.
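# agg computes several statistics in one pass (column names are placeholders):
df1.groupby('col').agg({'other_col': ['mean', 'count']})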
# Series map operations:
df1['col'].map({'val1':'1', 'val2':'2'}, na_action='ignore')
df1['col'].map(lambda x : x*2 if (x % 2 == 0) else x)
'''
If you give map a dict, it maps each key to its value
and sets everything else to NaN (careful!).
With a lambda you can do anything, which I prefer.
'''
```
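A quick sketch of the `map` pitfall above, with made-up values: the dict version silently turns every unmapped entry into NaN, while a lambda with `dict.get` keeps them.
```python=3.9
s = pd.Series(['val1', 'val2', 'val3'])
s.map({'val1': '1', 'val2': '2'})                      # 'val3' becomes NaN
s.map(lambda x: {'val1': '1', 'val2': '2'}.get(x, x))  # 'val3' survives unchanged
```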
## sklearn
### Imports:
```python=3.9
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, average_precision_score, classification_report
from sklearn.metrics import confusion_matrix, plot_confusion_matrix  # plot_confusion_matrix was removed in sklearn 1.2; ConfusionMatrixDisplay replaces it
from sklearn.linear_model import LinearRegression, SGDClassifier # SGD = Stochastic Gradient Descent
from sklearn.svm import SVC
from sklearn.neighbors import NearestNeighbors, KDTree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
```
### Functions:
```python=3.9
# train/test split:
x_train, x_test, y_train, y_test = train_test_split(df1[['col1', 'col2']], df1['col3'], train_size=0.75)
# Linear regression:
lin_reg = LinearRegression().fit(x_train, y_train)
y_pred = lin_reg.predict(x_test)
print(lin_reg.score(x_test, y_test))
print(lin_reg.coef_)  # slope for each feature (the attribute is coef_, not coefs_)
print(mean_squared_error(y_test, y_pred))  # or whichever metric you like.
# SVM:
svm = SVC(gamma='auto').fit(x_train, y_train)
# then the same routine as above for the metrics.
# Nearest Neighbors:
neighbors = NearestNeighbors(n_neighbors=42, algorithm='ball_tree').fit(X)  # no x_train/y_train here: unsupervised neighbor search, not classification.
rep_dist, rep_index = neighbors.kneighbors(X_test)  # rep = the nearest representative.
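# KDTree (imported above) answers the same queries; X and X_test are the same assumed arrays:
tree = KDTree(X)
dist, ind = tree.query(X_test, k=42)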
# Random Forest:
rf = RandomForestClassifier().fit(x_train, y_train)
print(rf.score(x_test, y_test))
# neural network (you never know, right?):
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, max_iter=300)
clf.fit(x_train, y_train)
clf.predict_proba(x_test)  # for a neural network, having the probabilities is nice.
```
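The imports above bring in `classification_report` and `confusion_matrix` but the block never uses them. A minimal sketch, assuming a fitted classifier such as `clf` above:
```python=3.9
y_pred = clf.predict(x_test)
print(confusion_matrix(y_test, y_pred))       # rows = true labels, columns = predicted labels
print(classification_report(y_test, y_pred))  # precision/recall/f1 per class
```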
## Matplotlib
### Imports:
```python=3.9
import matplotlib.pyplot as plt
```
### Functions:
```python=3.9
# simple plot without pandas:
plt.plot(x, y, color='green')  # plot takes a single color for the whole line
plt.scatter(x, y, c=z)         # scatter accepts an array of values through c
# simple plot with pandas:
df1.plot(x='col1', y='col2')  # line plot
df1.plot.scatter(x='col1', y='col2', c='col3', cmap='viridis')  # c/cmap only apply to scatter (there is no df1.scatter)
# subplots, mixing the two approaches:
fig, axes = plt.subplots(nrows=2, ncols=2)
df1.plot.scatter(x='col1', y='col2', ax=axes[0, 1])
axes[0, 1].set_title('hello friend')
axes[1, 0].plot(x, y)
axes[1, 0].set_title('hello you')
```
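Two finishing touches worth keeping on a cheat sheet (plain matplotlib, nothing assumed beyond the `fig` above):
```python=3.9
fig.tight_layout()  # keep subplot titles and labels from overlapping
plt.show()          # or fig.savefig('figure.png', dpi=150) to write to disk
```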