शून्य से गो AI बनाना

यह लेख आपको चरण-दर-चरण सरलीकृत AlphaGo Zero शैली गो AI बनाना सिखाता है, जिसमें गेम लॉजिक, न्यूरल नेटवर्क, MCTS और प्रशिक्षण प्रक्रिया शामिल है।

शून्य से गो AI बनाने के लिए चार मुख्य मॉड्यूल चाहिए: बोर्ड और नियम लॉजिक, न्यूरल नेटवर्क (पॉलिसी हेड और वैल्यू हेड), मोंटे कार्लो ट्री सर्च (MCTS), और सेल्फ-प्ले रिइन्फोर्समेंट लर्निंग प्रशिक्षण लूप; यह लेख सरलीकृत AlphaGo Zero आर्किटेक्चर को 9×9 बोर्ड पर कार्यान्वित करता है, जो एमेच्योर शुरुआती स्तर की खेल शक्ति वाला AI प्रशिक्षित कर सकता है।

सीखने के लक्ष्य

इस ट्यूटोरियल को पूरा करने के बाद, आपके पास एक गो AI होगा जो:

9×9 बोर्ड पर सेल्फ-प्ले कर सकता है
रिइन्फोर्समेंट लर्निंग से निरंतर सुधार कर सकता है
एमेच्योर शुरुआती स्तर की खेल शक्ति प्राप्त कर सकता है

प्रोजेक्ट आर्किटेक्चर

mini-alphago/
├── game/
│   ├── __init__.py
│   ├── board.py          # बोर्ड लॉजिक
│   ├── rules.py          # नियम कार्यान्वयन
│   └── state.py          # गेम स्थिति
├── model/
│   ├── __init__.py
│   ├── network.py        # न्यूरल नेटवर्क
│   └── features.py       # फीचर एनकोडिंग
├── mcts/
│   ├── __init__.py
│   ├── node.py           # MCTS नोड
│   └── search.py         # MCTS सर्च
├── training/
│   ├── __init__.py
│   ├── self_play.py      # सेल्फ-प्ले
│   └── trainer.py        # ट्रेनर
├── main.py               # मुख्य प्रोग्राम
└── requirements.txt

पहला चरण: बोर्ड और नियम

बोर्ड कार्यान्वयन

# game/board.py
import numpy as np

class Board:
    """गो बोर्ड"""

    EMPTY = 0
    BLACK = 1
    WHITE = 2

    def __init__(self, size=9):
        self.size = size
        self.board = np.zeros((size, size), dtype=np.int8)
        self.current_player = self.BLACK
        self.ko_point = None
        self.history = []

    def copy(self):
        """बोर्ड कॉपी करें"""
        new_board = Board(self.size)
        new_board.board = self.board.copy()
        new_board.current_player = self.current_player
        new_board.ko_point = self.ko_point
        new_board.history = self.history.copy()
        return new_board

    def get_opponent(self, player):
        """प्रतिद्वंद्वी प्राप्त करें"""
        return self.WHITE if player == self.BLACK else self.BLACK

    def is_on_board(self, x, y):
        """बोर्ड पर है या नहीं जांचें"""
        return 0 <= x < self.size and 0 <= y < self.size

    def get_neighbors(self, x, y):
        """पड़ोसी बिंदु प्राप्त करें"""
        neighbors = []
        for dx, dy in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
            nx, ny = x + dx, y + dy
            if self.is_on_board(nx, ny):
                neighbors.append((nx, ny))
        return neighbors

    def get_group(self, x, y):
        """पत्थर समूह प्राप्त करें (जुड़े समान रंग के पत्थर)"""
        color = self.board[x, y]
        if color == self.EMPTY:
            return set(), set()

        group = set()
        liberties = set()
        stack = [(x, y)]

        while stack:
            cx, cy = stack.pop()
            if (cx, cy) in group:
                continue
            group.add((cx, cy))

            for nx, ny in self.get_neighbors(cx, cy):
                if self.board[nx, ny] == self.EMPTY:
                    liberties.add((nx, ny))
                elif self.board[nx, ny] == color and (nx, ny) not in group:
                    stack.append((nx, ny))

        return group, liberties

    def count_liberties(self, x, y):
        """लिबर्टी गिनें"""
        _, liberties = self.get_group(x, y)
        return len(liberties)

    def remove_group(self, group):
        """पत्थर समूह हटाएं"""
        for x, y in group:
            self.board[x, y] = self.EMPTY

    def is_legal(self, x, y, player=None):
        """वैध चाल है या नहीं जांचें"""
        if player is None:
            player = self.current_player

        # खाली बिंदु है या नहीं जांचें
        if self.board[x, y] != self.EMPTY:
            return False

        # Ko है या नहीं जांचें
        if self.ko_point == (x, y):
            return False

        # चाल सिमुलेट करें
        test_board = self.copy()
        test_board.board[x, y] = player

        # पहले कैप्चर कर सकते हैं या नहीं जांचें
        opponent = self.get_opponent(player)
        captured = []
        for nx, ny in self.get_neighbors(x, y):
            if test_board.board[nx, ny] == opponent:
                group, liberties = test_board.get_group(nx, ny)
                if len(liberties) == 0:
                    captured.extend(group)

        if captured:
            return True

        # आत्महत्या जांचें
        _, liberties = test_board.get_group(x, y)
        if len(liberties) == 0:
            return False

        return True

    def play(self, x, y):
        """चाल खेलें"""
        if not self.is_legal(x, y):
            return False

        player = self.current_player
        opponent = self.get_opponent(player)

        # चाल खेलें
        self.board[x, y] = player

        # कैप्चर करें
        captured = []
        for nx, ny in self.get_neighbors(x, y):
            if self.board[nx, ny] == opponent:
                group, liberties = self.get_group(nx, ny)
                if len(liberties) == 0:
                    captured.extend(group)
                    self.remove_group(group)

        # Ko सेट करें
        if len(captured) == 1:
            cx, cy = list(captured)[0]
            _, my_liberties = self.get_group(x, y)
            if len(my_liberties) == 1:
                self.ko_point = (cx, cy)
            else:
                self.ko_point = None
        else:
            self.ko_point = None

        # इतिहास रिकॉर्ड करें
        self.history.append((x, y, player))

        # खिलाड़ी बदलें
        self.current_player = opponent

        return True

    def pass_move(self):
        """पास (खाली चाल)"""
        self.history.append((-1, -1, self.current_player))
        self.current_player = self.get_opponent(self.current_player)
        self.ko_point = None

    def is_game_over(self):
        """गेम समाप्त है या नहीं जांचें"""
        if len(self.history) < 2:
            return False
        # दोनों पक्ष लगातार पास
        return (self.history[-1][0] == -1 and
                self.history[-2][0] == -1)

    def get_legal_moves(self):
        """सभी वैध चालें प्राप्त करें"""
        moves = []
        for x in range(self.size):
            for y in range(self.size):
                if self.is_legal(x, y):
                    moves.append((x, y))
        moves.append((-1, -1))  # pass
        return moves

    def score(self):
        """जीत/हार गणना करें (सरल एरिया स्कोरिंग)"""
        black_score = np.sum(self.board == self.BLACK)
        white_score = np.sum(self.board == self.WHITE)

        # सरल क्षेत्र गणना
        for x in range(self.size):
            for y in range(self.size):
                if self.board[x, y] == self.EMPTY:
                    neighbors = self.get_neighbors(x, y)
                    colors = set(self.board[nx, ny] for nx, ny in neighbors)
                    colors.discard(self.EMPTY)
                    if len(colors) == 1:
                        if self.BLACK in colors:
                            black_score += 1
                        else:
                            white_score += 1

        komi = 5.5 if self.size == 9 else 7.5
        return black_score - white_score - komi

दूसरा चरण: फीचर एनकोडिंग

इनपुट फीचर्स

# model/features.py
import numpy as np

def encode_board(board):
    """
    बोर्ड को न्यूरल नेटवर्क इनपुट में एनकोड करें

    फीचर प्लेन:
    0: अपने पत्थर
    1: प्रतिद्वंद्वी के पत्थर
    2: खाली बिंदु
    3: पिछली चाल स्थिति
    4: दूसरी अंतिम चाल स्थिति
    5: वैध चाल स्थान
    6: काले की बारी (सभी 1 या सभी 0)
    """
    size = board.size
    features = np.zeros((7, size, size), dtype=np.float32)

    current = board.current_player
    opponent = board.get_opponent(current)

    # मूल पत्थर स्थिति
    features[0] = (board.board == current).astype(np.float32)
    features[1] = (board.board == opponent).astype(np.float32)
    features[2] = (board.board == board.EMPTY).astype(np.float32)

    # हाल की चालें
    if len(board.history) >= 1:
        x, y, _ = board.history[-1]
        if x >= 0:
            features[3, x, y] = 1.0

    if len(board.history) >= 2:
        x, y, _ = board.history[-2]
        if x >= 0:
            features[4, x, y] = 1.0

    # वैध चालें
    for x in range(size):
        for y in range(size):
            if board.is_legal(x, y):
                features[5, x, y] = 1.0

    # किसकी बारी
    if current == board.BLACK:
        features[6] = np.ones((size, size), dtype=np.float32)

    return features

तीसरा चरण: न्यूरल नेटवर्क

ड्यूअल-हेड नेटवर्क आर्किटेक्चर

# model/network.py
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """रेसिड्यूअल ब्लॉक"""

    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        residual = x
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        x = F.relu(x + residual)
        return x


class PolicyValueNetwork(nn.Module):
    """पॉलिसी-वैल्यू ड्यूअल-हेड नेटवर्क"""

    def __init__(self, board_size=9, input_channels=7, num_filters=64, num_blocks=4):
        super().__init__()
        self.board_size = board_size

        # प्रारंभिक कन्वोल्यूशन
        self.conv_input = nn.Conv2d(input_channels, num_filters, 3, padding=1)
        self.bn_input = nn.BatchNorm2d(num_filters)

        # रेसिड्यूअल ब्लॉक
        self.residual_blocks = nn.ModuleList([
            ResidualBlock(num_filters) for _ in range(num_blocks)
        ])

        # Policy Head
        self.policy_conv = nn.Conv2d(num_filters, 2, 1)
        self.policy_bn = nn.BatchNorm2d(2)
        self.policy_fc = nn.Linear(2 * board_size * board_size, board_size * board_size + 1)

        # Value Head
        self.value_conv = nn.Conv2d(num_filters, 1, 1)
        self.value_bn = nn.BatchNorm2d(1)
        self.value_fc1 = nn.Linear(board_size * board_size, 64)
        self.value_fc2 = nn.Linear(64, 1)

    def forward(self, x):
        # साझा ट्रंक
        x = F.relu(self.bn_input(self.conv_input(x)))
        for block in self.residual_blocks:
            x = block(x)

        # Policy Head
        policy = F.relu(self.policy_bn(self.policy_conv(x)))
        policy = policy.view(policy.size(0), -1)
        policy = self.policy_fc(policy)
        policy = F.log_softmax(policy, dim=1)

        # Value Head
        value = F.relu(self.value_bn(self.value_conv(x)))
        value = value.view(value.size(0), -1)
        value = F.relu(self.value_fc1(value))
        value = torch.tanh(self.value_fc2(value))

        return policy, value


def create_network(board_size=9):
    """नेटवर्क बनाएं"""
    return PolicyValueNetwork(
        board_size=board_size,
        input_channels=7,
        num_filters=64,
        num_blocks=4
    )

चौथा चरण: MCTS कार्यान्वयन

नोड क्लास

# mcts/node.py
import numpy as np

class MCTSNode:
    """MCTS नोड"""

    def __init__(self, prior=0.0):
        self.visit_count = 0
        self.value_sum = 0.0
        self.prior = prior
        self.children = {}

    @property
    def value(self):
        if self.visit_count == 0:
            return 0.0
        return self.value_sum / self.visit_count

    def expand(self, policy, legal_moves):
        """नोड विस्तार करें"""
        for move in legal_moves:
            if move not in self.children:
                idx = move[0] * 9 + move[1] if move[0] >= 0 else 81
                self.children[move] = MCTSNode(prior=np.exp(policy[idx]))

    def select_child(self, c_puct=1.5):
        """PUCT से चाइल्ड नोड चुनें"""
        best_score = -float('inf')
        best_move = None
        best_child = None

        sqrt_total = np.sqrt(max(1, self.visit_count))

        for move, child in self.children.items():
            if child.visit_count > 0:
                q_value = child.value
            else:
                q_value = 0.0

            u_value = c_puct * child.prior * sqrt_total / (1 + child.visit_count)
            score = q_value + u_value

            if score > best_score:
                best_score = score
                best_move = move
                best_child = child

        return best_move, best_child

सर्च कार्यान्वयन

# mcts/search.py
import numpy as np
import torch
from .node import MCTSNode

class MCTS:
    """मोंटे कार्लो ट्री सर्च"""

    def __init__(self, network, board_size=9, num_simulations=100, c_puct=1.5):
        self.network = network
        self.board_size = board_size
        self.num_simulations = num_simulations
        self.c_puct = c_puct

    def search(self, board, add_noise=False):
        """MCTS सर्च निष्पादित करें"""
        root = MCTSNode()

        # रूट नोड मूल्यांकन
        policy, value = self.evaluate(board)
        legal_moves = board.get_legal_moves()
        root.expand(policy, legal_moves)

        # Dirichlet नॉइज़ जोड़ें (प्रशिक्षण के दौरान)
        if add_noise:
            self.add_dirichlet_noise(root)

        # सिमुलेशन निष्पादित करें
        for _ in range(self.num_simulations):
            node = root
            scratch_board = board.copy()
            path = [node]

            # Selection
            while node.children and scratch_board.get_legal_moves():
                move, node = node.select_child(self.c_puct)
                if move[0] >= 0:
                    scratch_board.play(move[0], move[1])
                else:
                    scratch_board.pass_move()
                path.append(node)

                if scratch_board.is_game_over():
                    break

            # Expansion + Evaluation
            if not scratch_board.is_game_over():
                policy, value = self.evaluate(scratch_board)
                legal_moves = scratch_board.get_legal_moves()
                if legal_moves:
                    node.expand(policy, legal_moves)

            # सर्च प्रारंभ बिंदु से मूल्य गणना
            if scratch_board.is_game_over():
                score = scratch_board.score()
                value = 1.0 if score > 0 else (-1.0 if score < 0 else 0.0)
                if board.current_player != scratch_board.BLACK:
                    value = -value

            # Backpropagation
            for node in reversed(path):
                node.visit_count += 1
                node.value_sum += value
                value = -value

        return root

    def evaluate(self, board):
        """न्यूरल नेटवर्क से मूल्यांकन"""
        from model.features import encode_board

        features = encode_board(board)
        features = torch.tensor(features).unsqueeze(0)

        self.network.eval()
        with torch.no_grad():
            policy, value = self.network(features)

        return policy[0].numpy(), value[0].item()

    def add_dirichlet_noise(self, root, alpha=0.3, epsilon=0.25):
        """अन्वेषण नॉइज़ जोड़ें"""
        noise = np.random.dirichlet([alpha] * len(root.children))
        for i, child in enumerate(root.children.values()):
            child.prior = (1 - epsilon) * child.prior + epsilon * noise[i]

    def get_policy(self, root, temperature=1.0):
        """सर्च परिणाम से पॉलिसी प्राप्त करें"""
        visits = np.zeros(self.board_size ** 2 + 1)

        for move, child in root.children.items():
            idx = move[0] * self.board_size + move[1] if move[0] >= 0 else self.board_size ** 2
            visits[idx] = child.visit_count

        if temperature == 0:
            policy = np.zeros_like(visits)
            policy[np.argmax(visits)] = 1.0
        else:
            visits = visits ** (1 / temperature)
            policy = visits / visits.sum()

        return policy

    def select_move(self, root, temperature=1.0):
        """चाल चुनें"""
        policy = self.get_policy(root, temperature)
        idx = np.random.choice(len(policy), p=policy)

        if idx == self.board_size ** 2:
            return (-1, -1)
        else:
            return (idx // self.board_size, idx % self.board_size)

पांचवां चरण: सेल्फ-प्ले

# training/self_play.py
import numpy as np
from game.board import Board
from model.features import encode_board

def self_play_game(mcts, temperature=1.0, temp_threshold=30):
    """एक सेल्फ-प्ले गेम खेलें"""
    board = Board(size=9)
    game_history = []

    move_count = 0
    while not board.is_game_over() and move_count < 200:
        # MCTS सर्च
        root = mcts.search(board, add_noise=True)

        # पॉलिसी प्राप्त करें
        temp = temperature if move_count < temp_threshold else 0.0
        policy = mcts.get_policy(root, temp)

        # प्रशिक्षण डेटा रिकॉर्ड करें
        features = encode_board(board)
        game_history.append({
            'features': features,
            'policy': policy,
            'player': board.current_player
        })

        # चाल चुनें और खेलें
        move = mcts.select_move(root, temp)
        if move[0] >= 0:
            board.play(move[0], move[1])
        else:
            board.pass_move()

        move_count += 1

    # जीत/हार गणना
    score = board.score()
    winner = Board.BLACK if score > 0 else (Board.WHITE if score < 0 else 0)

    # मूल्य चिह्नित करें
    for data in game_history:
        if winner == 0:
            data['value'] = 0.0
        elif data['player'] == winner:
            data['value'] = 1.0
        else:
            data['value'] = -1.0

    return game_history


def generate_training_data(mcts, num_games=100):
    """प्रशिक्षण डेटा उत्पन्न करें"""
    all_data = []

    for i in range(num_games):
        print(f"Self-play game {i+1}/{num_games}")
        game_data = self_play_game(mcts)
        all_data.extend(game_data)

    return all_data

छठा चरण: ट्रेनर

# training/trainer.py
import torch
import torch.nn.functional as F
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

class Trainer:
    """ट्रेनर"""

    def __init__(self, network, learning_rate=0.001):
        self.network = network
        self.optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)

    def train_step(self, batch):
        """एक प्रशिक्षण चरण"""
        features, target_policy, target_value = batch

        self.network.train()
        self.optimizer.zero_grad()

        # फॉरवर्ड प्रोपेगेशन
        pred_policy, pred_value = self.network(features)

        # लॉस गणना
        policy_loss = F.kl_div(pred_policy, target_policy, reduction='batchmean')
        value_loss = F.mse_loss(pred_value.squeeze(), target_value)
        total_loss = policy_loss + value_loss

        # बैकप्रोपेगेशन
        total_loss.backward()
        self.optimizer.step()

        return {
            'total_loss': total_loss.item(),
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item()
        }

    def train_epoch(self, data, batch_size=32):
        """एक epoch प्रशिक्षित करें"""
        # डेटा तैयार करें
        features = np.array([d['features'] for d in data])
        policies = np.array([d['policy'] for d in data])
        values = np.array([d['value'] for d in data])

        features = torch.tensor(features, dtype=torch.float32)
        policies = torch.tensor(policies, dtype=torch.float32)
        values = torch.tensor(values, dtype=torch.float32)

        dataset = TensorDataset(features, policies, values)
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        total_losses = []
        for batch in loader:
            losses = self.train_step(batch)
            total_losses.append(losses['total_loss'])

        return np.mean(total_losses)

    def save(self, path):
        """मॉडल सेव करें"""
        torch.save(self.network.state_dict(), path)

    def load(self, path):
        """मॉडल लोड करें"""
        self.network.load_state_dict(torch.load(path))

सातवां चरण: मुख्य प्रोग्राम

# main.py
from model.network import create_network
from mcts.search import MCTS
from training.self_play import generate_training_data
from training.trainer import Trainer

def main():
    # नेटवर्क बनाएं
    network = create_network(board_size=9)
    mcts = MCTS(network, board_size=9, num_simulations=100)
    trainer = Trainer(network)

    # प्रशिक्षण लूप
    num_iterations = 100
    games_per_iteration = 50
    epochs_per_iteration = 10

    for iteration in range(num_iterations):
        print(f"\n=== Iteration {iteration + 1}/{num_iterations} ===")

        # सेल्फ-प्ले
        print("Generating self-play games...")
        training_data = generate_training_data(mcts, num_games=games_per_iteration)

        # प्रशिक्षण
        print("Training...")
        for epoch in range(epochs_per_iteration):
            loss = trainer.train_epoch(training_data)
            print(f"  Epoch {epoch + 1}: loss = {loss:.4f}")

        # सेव करें
        trainer.save(f"model_iter_{iteration + 1}.pt")

    print("\nTraining complete!")


if __name__ == "__main__":
    main()

चलाएं और परीक्षण करें

डिपेंडेंसी इंस्टॉल करें

pip install torch numpy

प्रशिक्षण चलाएं

python main.py

अपेक्षित आउटपुट

=== Iteration 1/100 ===
Generating self-play games...
Self-play game 1/50
Self-play game 2/50
...
Training...
  Epoch 1: loss = 2.3456
  Epoch 2: loss = 1.8765
  ...

सुधार सुझाव

अल्पकालिक सुधार

सुधार आइटम	विवरण
रेसिड्यूअल ब्लॉक बढ़ाएं	4 → 8 → 16 ब्लॉक
चैनल बढ़ाएं	64 → 128 → 256
सिमुलेशन बढ़ाएं	100 → 400 → 800
बड़ा प्रशिक्षण सेट	50 → 200 → 1000 गेम/इटरेशन

दीर्घकालिक सुधार

19×19 बोर्ड समर्थन
सहायक प्रशिक्षण लक्ष्य जोड़ें (क्षेत्र पूर्वानुमान)
पैरेलल सेल्फ-प्ले कार्यान्वित करें
GPU त्वरण जोड़ें

आगे पढ़ें

न्यूरल नेटवर्क आर्किटेक्चर विस्तृत व्याख्या — गहरा नेटवर्क डिज़ाइन
MCTS कार्यान्वयन विवरण — उन्नत सर्च तकनीकें
KataGo प्रशिक्षण तंत्र विश्लेषण — प्रोडक्शन-ग्रेड प्रशिक्षण प्रणाली
मुख्य पेपर गाइड — सैद्धांतिक आधार

प्रोजेक्ट आर्किटेक्चर​

पहला चरण: बोर्ड और नियम​

बोर्ड कार्यान्वयन​

दूसरा चरण: फीचर एनकोडिंग​

इनपुट फीचर्स​

तीसरा चरण: न्यूरल नेटवर्क​

ड्यूअल-हेड नेटवर्क आर्किटेक्चर​

चौथा चरण: MCTS कार्यान्वयन​

नोड क्लास​

सर्च कार्यान्वयन​

पांचवां चरण: सेल्फ-प्ले​

छठा चरण: ट्रेनर​

सातवां चरण: मुख्य प्रोग्राम​

चलाएं और परीक्षण करें​

डिपेंडेंसी इंस्टॉल करें​

प्रशिक्षण चलाएं​

अपेक्षित आउटपुट​

सुधार सुझाव​

अल्पकालिक सुधार​

दीर्घकालिक सुधार​

आगे पढ़ें​