# -- PTML helpful tools --
## Pandas
### Imports:
```python=3.9
import pandas as pd
```
### Functions:
```python=3.9
# shape of a dataframe (rows, columns):
df1.shape
# read_csv:
df1 = pd.read_csv('db.csv', sep=',', index_col=0, dtype={'col': str})
# split:
sub_df1 = df1['col2parse'].str.split(',', expand=True)  # expand=True spreads the split into separate columns.
# rename:
df1.rename(columns={'col1':'Col 1'}, inplace=True)
# astype:
df1['col'] = df1['col'].astype(float)
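# if the cast can fail, pd.to_numeric with errors='coerce' turns bad values into NaN instead of raising:
df1['col'] = pd.to_numeric(df1['col'], errors='coerce')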
# join (merge is the column-based equivalent; join matches on the other frame's index):
df1.join(df2, on='pivot_column', how='inner')  # and if you ever hit a uint64/object dtype clash:
df1.join(df2.set_index('pivot_column'), how='inner')
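# the explicit merge equivalent, matching on columns in both frames:
pd.merge(df1, df2, on='pivot_column', how='inner')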
# drop:
df1.drop(columns=['col2drop'], inplace=True)
# dropna:
df1.dropna(inplace=True)
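# fillna is the alternative when you would rather keep the rows (0 is an arbitrary placeholder):
df1 = df1.fillna(0)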
# Group By operations:
df1.groupby('col').mean()
df1.groupby('col')[['other_col']].mean()  # if you want the mean of one specific column.
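# agg computes several statistics in one pass (column names are placeholders):
df1.groupby('col').agg({'other_col': ['mean', 'count']})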
# Series map operations:
df1['col'].map({'val1':'1', 'val2':'2'}, na_action='ignore')
df1['col'].map(lambda x : x*2 if (x % 2 == 0) else x)
'''
If you give map a dict, it maps each key to its value
and sets everything else to NaN (careful!).
With a lambda you can do anything, which I prefer.
'''
```
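A quick sketch of the `map` pitfall above, with made-up values: the dict version silently turns every unmapped entry into NaN, while a lambda with `dict.get` keeps them.
```python=3.9
s = pd.Series(['val1', 'val2', 'val3'])
s.map({'val1': '1', 'val2': '2'})                      # 'val3' becomes NaN
s.map(lambda x: {'val1': '1', 'val2': '2'}.get(x, x))  # 'val3' survives unchanged
```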
## sklearn
### Imports:
```python=3.9
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, average_precision_score, classification_report
from sklearn.metrics import confusion_matrix, plot_confusion_matrix  # plot_confusion_matrix was removed in sklearn 1.2; ConfusionMatrixDisplay replaces it
from sklearn.linear_model import LinearRegression, SGDClassifier # SGD = Stochastic Gradient Descent
from sklearn.svm import SVC
from sklearn.neighbors import NearestNeighbors, KDTree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
```
### Functions:
```python=3.9
# train/test split:
x_train, x_test, y_train, y_test = train_test_split(df1[['col1', 'col2']], df1['col3'], train_size=0.75)
# Linear regression:
lin_reg = LinearRegression().fit(x_train, y_train)
y_pred = lin_reg.predict(x_test)
print(lin_reg.score(x_test, y_test))
print(lin_reg.coef_)  # slope for each feature (the attribute is coef_, not coefs_)
print(mean_squared_error(y_test, y_pred))  # or whichever metric you like.
# SVM:
svm = SVC(gamma='auto').fit(x_train, y_train)
# then the same routine as above for the metrics.
# Nearest Neighbors:
neighbors = NearestNeighbors(n_neighbors=42, algorithm='ball_tree').fit(X)  # no x_train/y_train here: unsupervised neighbor search, not classification.
rep_dist, rep_index = neighbors.kneighbors(X_test)  # rep = the nearest representative.
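# KDTree (imported above) answers the same queries; X and X_test are the same assumed arrays:
tree = KDTree(X)
dist, ind = tree.query(X_test, k=42)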
# Random Forest:
rf = RandomForestClassifier().fit(x_train, y_train)
print(rf.score(x_test, y_test))
# neural network (you never know, right?):
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, max_iter=300)
clf.fit(x_train, y_train)
clf.predict_proba(x_test)  # for a neural network, having the probabilities is nice.
```
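The imports above bring in `classification_report` and `confusion_matrix` but the block never uses them. A minimal sketch, assuming a fitted classifier such as `clf` above:
```python=3.9
y_pred = clf.predict(x_test)
print(confusion_matrix(y_test, y_pred))       # rows = true labels, columns = predicted labels
print(classification_report(y_test, y_pred))  # precision/recall/f1 per class
```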
## Matplotlib
### Imports:
```python=3.9
import matplotlib.pyplot as plt
```
### Functions:
```python=3.9
# simple plot without pandas:
plt.plot(x, y, color='green')  # plot takes a single color for the whole line
plt.scatter(x, y, c=z)         # scatter accepts an array of values through c
# simple plot with pandas:
df1.plot(x='col1', y='col2')  # line plot
df1.plot.scatter(x='col1', y='col2', c='col3', cmap='viridis')  # c/cmap only apply to scatter (there is no df1.scatter)
# subplots, mixing the two approaches:
fig, axes = plt.subplots(nrows=2, ncols=2)
df1.plot.scatter(x='col1', y='col2', ax=axes[0, 1])
axes[0, 1].set_title('hello friend')
axes[1, 0].plot(x, y)
axes[1, 0].set_title('hello you')
```
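Two finishing touches worth keeping on a cheat sheet (plain matplotlib, nothing assumed beyond the `fig` above):
```python=3.9
fig.tight_layout()  # keep subplot titles and labels from overlapping
plt.show()          # or fig.savefig('figure.png', dpi=150) to write to disk
```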