Mục lục

Các loại Machine Learning
- Supervised Learning (Học có giám sát)
- Unsupervised Learning (Học không giám sát)
Các thuật toán cơ bản
Xử lý dữ liệu
- Data Preprocessing
- Feature Engineering
Cross-Validation
Hyperparameter Tuning
Model Evaluation
Deep Learning với TensorFlow
Kết luận

Machine Learning cơ bản cho người mới bắt đầu

Machine Learning (ML) là một nhánh của Artificial Intelligence (AI) cho phép máy tính học và cải thiện từ dữ liệu mà không cần được lập trình rõ ràng. Trong bài viết này, tôi sẽ giới thiệu những khái niệm cơ bản và thực hành với Python.

1. Các loại Machine Learning

Supervised Learning (Học có giám sát)

# Supervised learning example: predict house prices with linear regression.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Sample data: floor area, number of rooms and price for five houses.
data = {
    'diện_tích': [100, 150, 200, 250, 300],
    'số_phòng': [2, 3, 3, 4, 4],
    'giá': [500, 750, 1000, 1250, 1500],
}

df = pd.DataFrame(data)
X = df[['diện_tích', 'số_phòng']]
y = df['giá']

# Split the data. With only five rows, test_size=0.2 would hold out a single
# sample, and R² is not defined for fewer than two samples (scikit-learn
# returns NaN with a warning). Holding out 40% keeps two test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Train the model.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Evaluate: mean squared error and coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse:.2f}')
print(f'R²: {r2:.2f}')

Unsupervised Learning (Học không giám sát)

# Unsupervised learning example: group customers with K-Means.
import matplotlib.pyplot as plt
import numpy as np  # imported here so the snippet runs on its own
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Sample data: one row per customer.
customer_data = np.array([
    [25, 50000],  # [age, income]
    [30, 60000],
    [35, 70000],
    [40, 80000],
    [45, 90000],
    [50, 100000],
])

# Standardize the columns; income is roughly a thousand times larger than
# age in raw units, so it would otherwise dominate the clustering.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(customer_data)

# Cluster. n_init is set explicitly because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(scaled_data)

# Visualize the clusters in the original (unscaled) units.
plt.scatter(customer_data[:, 0], customer_data[:, 1],
            c=clusters, cmap='viridis')
plt.xlabel('Tuổi')
plt.ylabel('Thu nhập')
plt.title('Phân cụm khách hàng')
plt.show()

2. Các thuật toán cơ bản

Linear Regression (Hồi quy tuyến tính)

# Linear regression on synthetic data generated as y = 2x + 1 plus noise.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

# Create sample data; the fixed seed makes the run reproducible.
np.random.seed(42)
X = np.random.rand(100, 1) * 10
y = 2 * X + 1 + np.random.randn(100, 1) * 0.5

# Train the model.
model = LinearRegression()
model.fit(X, y)

# Predict on the training inputs to draw the fitted line.
y_pred = model.predict(X)

# Visualize the data points and the fitted line.
plt.scatter(X, y, alpha=0.5, label='Dữ liệu thực')
plt.plot(X, y_pred, color='red', label='Dự đoán')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

# y is a (100, 1) column, so coef_ is 2-D and intercept_ is 1-D.
print(f'Coefficient: {model.coef_[0][0]:.2f}')
print(f'Intercept: {model.intercept_[0]:.2f}')

Logistic Regression (Hồi quy logistic)

# Logistic regression example: classify emails as spam / not spam.
import numpy as np  # imported here so the snippet runs on its own
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Sample data: one row per email, four binary features.
X = np.array([
    [1, 0, 1, 0],  # [has_link, has_phone_number, has_urgent_word, has_uppercase]
    [0, 1, 0, 1],
    [1, 1, 1, 1],
    [0, 0, 0, 0],
    [1, 0, 0, 1],
    [0, 1, 1, 0],
])

y = np.array([1, 1, 1, 0, 0, 0])  # 1: spam, 0: not spam

# Train the model.
model = LogisticRegression()
model.fit(X, y)

# Predict. NOTE: these are the same rows the model was trained on, so the
# scores below measure fit to the training data, not generalization.
y_pred = model.predict(X)

# Evaluate.
accuracy = accuracy_score(y, y_pred)
print(f'Độ chính xác: {accuracy:.2f}')
print(classification_report(y, y_pred))

Decision Tree (Cây quyết định)

# Decision tree example: predict whether a customer makes a purchase.
import matplotlib.pyplot as plt
import pandas as pd  # imported here so the snippet runs on its own
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

# Sample data: age, income level ('thấp' / 'cao') and the purchase label.
data = {
    'tuổi': [25, 35, 45, 55, 25, 35, 45, 55],
    'thu_nhập': ['thấp', 'thấp', 'cao', 'cao', 'cao', 'thấp', 'cao', 'thấp'],
    'mua_hàng': [0, 0, 1, 1, 1, 0, 1, 0],
}

df = pd.DataFrame(data)

# Convert the text income level into integer codes the tree can consume.
le = LabelEncoder()
df['thu_nhập_encoded'] = le.fit_transform(df['thu_nhập'])

X = df[['tuổi', 'thu_nhập_encoded']]
y = df['mua_hàng']

# Train the model.
model = DecisionTreeClassifier(random_state=42)
model.fit(X, y)

# Visualize the fitted tree; feature_names follow the column order of X.
plt.figure(figsize=(10, 8))
plot_tree(model, feature_names=['Tuổi', 'Thu nhập'],
          class_names=['Không mua', 'Mua'], filled=True)
plt.show()

3. Xử lý dữ liệu

Data Preprocessing

# Data preprocessing example: impute missing values and encode a category.
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Create sample data with one missing value in each feature column.
data = {
    'tuổi': [25, 30, None, 35, 40],
    'thu_nhập': [50000, 60000, 70000, None, 90000],
    'thành_phố': ['Hà Nội', 'TP.HCM', 'Đà Nẵng', 'Hà Nội', None],
    'mua_hàng': [1, 0, 1, 0, 1],
}

df = pd.DataFrame(data)

# Handle missing values.
# Numeric columns: replace with the column median.
imputer_num = SimpleImputer(strategy='median')
df[['tuổi', 'thu_nhập']] = imputer_num.fit_transform(df[['tuổi', 'thu_nhập']])

# Categorical column: replace with the most frequent value.
# SimpleImputer looks for np.nan by default, so the missing marker in the
# text column (None) is normalized to np.nan before imputing.
city = df[['thành_phố']]
city = city.astype(object).where(city.notna(), np.nan)
imputer_cat = SimpleImputer(strategy='most_frequent')
# fit_transform returns an (n, 1) array; flatten it for the single column.
df['thành_phố'] = imputer_cat.fit_transform(city).ravel()

# Encode the categorical variable as integer codes.
le = LabelEncoder()
df['thành_phố_encoded'] = le.fit_transform(df['thành_phố'])

print("Dữ liệu sau khi xử lý:")
print(df)

Feature Engineering

# Feature engineering example. Continues from the preprocessing snippet:
# it needs `df`, `pd` and `StandardScaler` to be in scope already.

# Create a new feature: bucket age into three labelled groups.
df['tuổi_nhóm'] = pd.cut(df['tuổi'], bins=[0, 30, 40, 100],
                         labels=['trẻ', 'trung niên', 'cao tuổi'])

# One-hot encode the age group.
df_encoded = pd.get_dummies(df, columns=['tuổi_nhóm'])

# Scale the numeric columns.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_encoded[['tuổi', 'thu_nhập']])
# Keep the scaled values next to the originals so the scaling step is
# visible in the printed frame instead of being discarded.
df_encoded[['tuổi_scaled', 'thu_nhập_scaled']] = df_scaled

print("Features sau khi engineering:")
print(df_encoded.head())

4. Cross-Validation

# Cross-validation example: estimate accuracy with 5-fold CV.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Synthetic binary-classification data. The eight-row purchase table from
# the decision-tree example has only four rows per class, which is too few
# for stratified 5-fold splitting.
X, y = make_classification(n_samples=200, n_features=4, random_state=42)

# Use cross-validation to evaluate the model.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# 5-fold cross-validation.
scores = cross_val_score(model, X, y, cv=5)

print(f'Độ chính xác trung bình: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})')

5. Hyperparameter Tuning

# Hyperparameter tuning example: exhaustive grid search with 5-fold CV.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Synthetic binary-classification data, large enough for 5-fold splitting.
X, y = make_classification(n_samples=200, n_features=4, random_state=42)

# Candidate values; every combination is tried.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
}

model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
# GridSearchCV fits clones of `model`; the fitted winner is available as
# grid_search.best_estimator_, while `model` itself stays unfitted.
grid_search.fit(X, y)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_:.3f}')

6. Model Evaluation

# Model evaluation example: confusion matrix and ROC curve for a classifier.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

# Build a binary-classification split and fit a classifier on it, so the
# metrics below run on a fitted model and a matching held-out set.
X, y = make_classification(n_samples=200, n_features=4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Confusion matrix.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('Thực tế')
plt.xlabel('Dự đoán')
plt.show()

# ROC curve, computed from the predicted probability of the positive class.
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.3f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

7. Deep Learning với TensorFlow

# Deep learning example: a small binary classifier built with Keras.
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Binary-classification data: the sigmoid output and binary cross-entropy
# loss below need 0/1 labels, which the earlier price-regression split does
# not provide.
X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardize the inputs; the scaler is fitted on the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create a simple neural network.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train; 20% of the training rows are held out for validation.
history = model.fit(X_train, y_train,
                    epochs=100,
                    batch_size=32,
                    validation_split=0.2,
                    verbose=0)

# Visualize training: accuracy on the left, loss on the right.
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

Kết luận

Machine Learning là một lĩnh vực rộng lớn và thú vị. Để thành công trong lĩnh vực này, bạn cần:

1. Nền tảng vững chắc

- Toán học: Linear Algebra, Calculus, Statistics
- Lập trình: Python, R, SQL
- Kiến thức domain: Hiểu rõ lĩnh vực ứng dụng

2. Thực hành thường xuyên

- Làm các project thực tế
- Tham gia competitions (Kaggle, etc.)
- Đọc papers và implement

3. Cập nhật xu hướng

- Deep Learning
- Reinforcement Learning
- AutoML
- MLOps

4. Tools và Frameworks

- Scikit-learn: ML cơ bản
- TensorFlow/PyTorch: Deep Learning
- Pandas/NumPy: Data manipulation
- Matplotlib/Seaborn: Visualization

Hãy bắt đầu với những khái niệm cơ bản và dần dần tiến tới các thuật toán phức tạp hơn! 🚀