Tic Tac Toe Python implementation, rewritten from https://github.com/antirez/ttt-rl
# Tic Tac Toe Python implementation
import random
from math import exp

from tqdm import tqdm

NN_INPUT_SIZE = 18
NN_HIDDEN_SIZE = 100
NN_OUTPUT_SIZE = 9
LEARNING_RATE = 0.1
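
# Network layout (see as_input() and forward() below): each of the 9 board
# cells is encoded as two bits, (1, 0) for 'X' and (0, 1) for 'O', giving 18
# input units. A single hidden layer of 100 ReLU units feeds 9 output logits,
# which softmax turns into a probability distribution over moves.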
class NeuralNetwork:
    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.weights_ih = [[0.0] * hidden_size for _ in range(input_size)]
        self.weights_ho = [[0.0] * output_size for _ in range(hidden_size)]
        self.bias_h = [0.0] * hidden_size
        self.bias_o = [0.0] * output_size
        self.random_init()

    def random_init(self, rand_min=-0.5, rand_max=0.5):
        for i in range(self.input_size):
            for j in range(self.hidden_size):
                self.weights_ih[i][j] = random.uniform(rand_min, rand_max)
        for i in range(self.hidden_size):
            for j in range(self.output_size):
                self.weights_ho[i][j] = random.uniform(rand_min, rand_max)
        for i in range(self.hidden_size):
            self.bias_h[i] = random.uniform(rand_min, rand_max)
        for i in range(self.output_size):
            self.bias_o[i] = random.uniform(rand_min, rand_max)

    def relu(self, x):
        return max(0, x)

    def relu_derivative(self, x):
        return 1. if x > 0. else 0.

    def abs(self, x):
        return x if x >= 0 else -x

    def softmax(self, x):
        max_val = max(x)
        exp_x = [exp(i - max_val) for i in x]
        sum_exp_x = sum(exp_x)
        if sum_exp_x > 0:
            return [i / sum_exp_x for i in exp_x]
        else:
            return [1 / len(x)] * len(x)
    def forward(self, inputs):
        self.inputs = inputs
        self.hidden = [0.0] * self.hidden_size
        for i in range(self.hidden_size):
            total = self.bias_h[i]
            for j in range(self.input_size):
                total += inputs[j] * self.weights_ih[j][i]
            self.hidden[i] = self.relu(total)
        self.raw_logits = [0.0] * self.output_size
        for i in range(self.output_size):
            total = self.bias_o[i]
            for j in range(self.hidden_size):
                total += self.hidden[j] * self.weights_ho[j][i]
            self.raw_logits[i] = total
        self.outputs = self.softmax(self.raw_logits)
        return self.outputs
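
    # Note: forward() caches self.inputs, self.hidden and self.raw_logits on
    # the instance; backward() below reuses the values from the most recent
    # forward pass, so the two calls must always be paired.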
    def backward(self, target_probs, learning_rate, reward_scaling=1.0):
        # Calculate output layer deltas
        output_deltas = []
        for i in range(self.output_size):
            delta = self.outputs[i] - target_probs[i]  # gradient of cross-entropy through softmax
            output_deltas.append(delta * self.abs(reward_scaling))
        # Calculate hidden layer deltas
        hidden_deltas = []
        for i in range(self.hidden_size):
            hidden_deltas.append(0.0)
            for j in range(self.output_size):
                hidden_deltas[i] += output_deltas[j] * self.weights_ho[i][j]
            hidden_deltas[i] *= self.relu_derivative(self.hidden[i])
        # Update weights and biases
        for i in range(self.hidden_size):
            for j in range(self.output_size):
                self.weights_ho[i][j] -= learning_rate * output_deltas[j] * self.hidden[i]
        for j in range(self.output_size):
            self.bias_o[j] -= learning_rate * output_deltas[j]
        for i in range(self.input_size):
            for j in range(self.hidden_size):
                self.weights_ih[i][j] -= learning_rate * hidden_deltas[j] * self.inputs[i]
        for j in range(self.hidden_size):
            self.bias_h[j] -= learning_rate * hidden_deltas[j]
        self.output_delta = output_deltas
        self.hidden_delta = hidden_deltas
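
    # Why outputs - target works as the output delta: with a softmax output
    # layer and a cross-entropy loss against target_probs, the gradient of the
    # loss with respect to each raw logit is exactly outputs[i] - target_probs[i].
    # reward_scaling only changes the magnitude of the step here; its sign is
    # handled in learn_from_game() through how target_probs is constructed.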
    def learn_from_game(self, move_history, winner, even=True):
        # even=True means the network played the even (second) side, i.e. 'O'.
        nn_symbol = 'O' if even else 'X'
        if winner == 'T':
            reward = 0.3
        elif winner == nn_symbol:
            reward = 1.0
        else:
            reward = -2.0
        state = GameState()
        for idx, move in enumerate(move_history):
            # Skip moves that were not made by the network: with even=True the
            # network moved at odd indices (1, 3, ...), with even=False at even ones.
            if (even and idx % 2 != 1) or (not even and idx % 2 != 0):
                continue
            # Recreate the board state as it was before this move was made.
            state.reset()
            for i in range(idx):
                state.make_move(move_history[i])
            self.forward(state.as_input())
            # Earlier moves are given less weight than later ones.
            move_importance = 0.5 + 0.5 * float(idx / len(move_history))
            scaled_reward = reward * move_importance
            target_probs = [0.0] * self.output_size
            if scaled_reward >= 0:
                target_probs[move] = 1
            else:
                # For negative reward, distribute probability to OTHER
                # valid moves, which is conceptually the same as discouraging
                # the move that we want to discourage.
                num_valid_moves = state.board.count('.') - 1
                for i in range(self.output_size):
                    if state.board[i] == '.' and i != move:
                        target_probs[i] = 1 / num_valid_moves
            self.backward(target_probs, LEARNING_RATE, reward_scaling=scaled_reward)
            # print(f"Learning from move {move} with reward {scaled_reward}")
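
# The scheme above is a simple policy-gradient style update (in the spirit of
# REINFORCE): after each finished game, every board state the network faced is
# replayed, and the move it chose is pushed toward probability 1 when the game
# was won or tied, or pushed toward the other legal moves when it was lost,
# with the step size scaled by |reward| and by how late in the game the move came.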
class GameState:
    def __init__(self, first_player=0):
        self.reset(first_player=first_player)

    def reset(self, first_player=0):
        self.board = ['.', '.', '.',
                      '.', '.', '.',
                      '.', '.', '.']
        self.current_player = first_player  # 0 for X (Human), 1 for O (NN)
        self.move_history = []

    def display(self):
        print("Current board:")
        for i in range(3):
            print(" ".join(self.board[i*3:(i+1)*3]))
        print(f"Current player: {'X' if self.current_player == 0 else 'O'}\n")

    def as_input(self):
        '''
        Cell indices:
        0, 1, 2
        3, 4, 5
        6, 7, 8
        '''
        input_vector = []
        for i in range(9):
            if self.board[i] == '.':
                input_vector.extend([0, 0])
            elif self.board[i] == 'X':
                input_vector.extend([1, 0])
            else:  # 'O'
                input_vector.extend([0, 1])
        return input_vector
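
    # Example encoding: with X on cell 0, O on cell 4 and every other cell
    # empty, as_input() returns
    # [1, 0,  0, 0,  0, 0,  0, 0,  0, 1,  0, 0,  0, 0,  0, 0,  0, 0].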
    def winner(self):
        # Check rows
        for i in range(3):
            if self.board[i*3] != '.' \
               and self.board[i*3] == self.board[i*3+1] \
               and self.board[i*3] == self.board[i*3+2]:
                return self.board[i*3]
        # Check columns
        for i in range(3):
            if self.board[i] != '.' \
               and self.board[i] == self.board[i+3] \
               and self.board[i] == self.board[i+6]:
                return self.board[i]
        # Check diagonals
        if self.board[0] != '.' \
           and self.board[0] == self.board[4] \
           and self.board[0] == self.board[8]:
            return self.board[0]
        if self.board[2] != '.' \
           and self.board[2] == self.board[4] \
           and self.board[2] == self.board[6]:
            return self.board[2]
        # Check for tie
        if '.' not in self.board:
            return 'T'  # Tie
        return None

    def make_move(self, move: int):
        current_symbol = 'O' if self.current_player == 1 else 'X'
        if self.board[move] == '.':
            self.board[move] = current_symbol
            self.current_player = self.current_player ^ 1
            self.move_history.append(move)
        else:
            raise ValueError("Invalid move")
    def get_computer_move(self, nn: NeuralNetwork, display_probs=False, move=True):
        probs = nn.forward(self.as_input())
        highest_prob = -1
        highest_prob_idx = -1
        best_move = None
        best_legal_prob = -1
        for i in range(9):
            if probs[i] > highest_prob:
                highest_prob = probs[i]
                highest_prob_idx = i
            if self.board[i] == '.':
                if best_move is None or probs[i] > best_legal_prob:
                    best_move = i
                    best_legal_prob = probs[i]
        # That's just for debugging. It's interesting to show to the user
        # in the first iterations of the game, since you can see how initially
        # the net picks illegal moves as best, and so forth.
        if display_probs:
            print("Probabilities of each move:")
            for i in range(3):
                for j in range(3):
                    pos = i * 3 + j
                    # Print probability as percentage
                    print(f"{probs[pos] * 100:.1f}", end="")
                    # Add markers: '*' is the overall argmax, '#' the best legal move.
                    if pos == highest_prob_idx:
                        print("*", end="")
                    elif pos == best_move:
                        print("#", end="")
                    print(" ", end="")
                print("\n")
            print(f"Sum of all probabilities: {sum(probs)}")
        if move:
            if self.board[best_move] == '.':
                self.make_move(best_move)
            else:
                raise ValueError("Invalid move")
        return best_move
    def get_random_move(self, move=True):
        if '.' not in self.board:
            raise ValueError("No legal moves left")
        while True:
            random_move = random.randint(0, 8)
            if self.board[random_move] == '.':
                if move:
                    self.make_move(random_move)
                return random_move
def play_random_game(nn=None, display=False):
    game = GameState()
    while not game.winner():
        game.get_random_move()
        if display:
            game.display()
        winner = game.winner()
        if winner is not None:
            break
        if nn is not None:
            game.get_computer_move(nn, display_probs=display)
            if display:
                game.display()
            winner = game.winner()
            if winner is not None:
                break
    if display:
        if winner == 'X':
            print("X wins!")
        elif winner == 'O':
            print("O wins!")
        else:
            print("It's a tie!")
    return game, winner
def play_nn_vs_nn(nn_x, nn_o, display=False):
    game = GameState()
    while not game.winner():
        if game.current_player == 0:  # Player X's turn
            game.get_computer_move(nn_x, display_probs=display)
        else:  # Player O's turn
            game.get_computer_move(nn_o, display_probs=display)
        if display:
            game.display()
        winner = game.winner()
        if winner is not None:
            break
    if display:
        if winner == 'X':
            print("X wins!")
        elif winner == 'O':
            print("O wins!")
        else:
            print("It's a tie!")
        game.display()
    return game, winner
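
# play_nn_vs_nn() is not used by the training loop below. As a rough sketch
# (not part of the original gist), it could be combined with learn_from_game()
# to train two networks against each other, e.g.:
def train_self_play_sketch(nn_x, nn_o, num_games=10000):
    # Hypothetical helper: pit nn_x (plays X, moves first) against nn_o
    # (plays O, moves second) and let both learn from every finished game.
    for _ in range(num_games):
        game, winner = play_nn_vs_nn(nn_x, nn_o)
        nn_x.learn_from_game(game.move_history, winner, even=False)
        nn_o.learn_from_game(game.move_history, winner, even=True)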
nn = NeuralNetwork(NN_INPUT_SIZE, NN_HIDDEN_SIZE, NN_OUTPUT_SIZE)

with tqdm(total=150000) as t:
    wins, losses, ties = 0, 0, 0
    for i in range(150000):
        game, winner = play_random_game(nn=nn)
        t.update(1)
        if winner == 'O':
            wins += 1
        elif winner == 'X':
            losses += 1
        else:
            ties += 1
        game_behind = wins - losses
        nn.learn_from_game(game.move_history, winner, even=True)
        t.set_postfix(r_wins=wins/(i+1), r_losses=losses/(i+1), r_ties=ties/(i+1), game_behind=game_behind)
        t.refresh()
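
# During training the network always plays O (the second move) against a
# purely random X, so r_wins / r_losses / r_ties above are its running win,
# loss and tie rates over all games played so far.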
game = GameState()
while True:
    game.display()
    move = input("Enter your move (0-8) or exit (q): ")
    if move == 'q':
        break
    game.make_move(int(move))
    if game.winner() is not None:
        break
    game.get_computer_move(nn, display_probs=True)
    if game.winner() is not None:
        break

# Show the final position and result once the game ends.
if game.winner() is not None:
    game.display()
    result = game.winner()
    print(f"{result} wins!" if result != 'T' else "It's a tie!")