Tic Tac Toe Python implementation, rewritten from https://github.com/antirez/ttt-rl
# Tic Tac Toe Python implementation
import random
from math import exp
from tqdm import tqdm
NN_INPUT_SIZE = 18
NN_HIDDEN_SIZE = 100
NN_OUTPUT_SIZE = 9
LEARNING_RATE = 0.1

class NeuralNetwork:
    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.weights_ih = [[0.0] * hidden_size for _ in range(input_size)]
        self.weights_ho = [[0.0] * output_size for _ in range(hidden_size)]
        self.bias_h = [0.0] * hidden_size
        self.bias_o = [0.0] * output_size
        self.random_init()

    def random_init(self, rand_min=-0.5, rand_max=0.5):
        for i in range(self.input_size):
            for j in range(self.hidden_size):
                self.weights_ih[i][j] = random.uniform(rand_min, rand_max)
        for i in range(self.hidden_size):
            for j in range(self.output_size):
                self.weights_ho[i][j] = random.uniform(rand_min, rand_max)
        for i in range(self.hidden_size):
            self.bias_h[i] = random.uniform(rand_min, rand_max)
        for i in range(self.output_size):
            self.bias_o[i] = random.uniform(rand_min, rand_max)

    def relu(self, x):
        return max(0, x)

    def relu_derivative(self, x):
        return 1. if x > 0. else 0.

    def abs(self, x):
        return x if x >= 0 else -x

    def softmax(self, x):
        # Subtract the max for numerical stability before exponentiating.
        max_val = max(x)
        exp_x = [exp(i - max_val) for i in x]
        sum_exp_x = sum(exp_x)
        if sum_exp_x > 0:
            return [i / sum_exp_x for i in exp_x]
        else:
            return [1 / len(x)] * len(x)

    def forward(self, inputs):
        self.inputs = inputs
        self.hidden = [0.0] * self.hidden_size
        for i in range(self.hidden_size):
            total = self.bias_h[i]
            for j in range(self.input_size):
                total += inputs[j] * self.weights_ih[j][i]
            self.hidden[i] = self.relu(total)
        self.raw_logits = [0.0] * self.output_size
        for i in range(self.output_size):
            total = self.bias_o[i]
            for j in range(self.hidden_size):
                total += self.hidden[j] * self.weights_ho[j][i]
            self.raw_logits[i] = total
        self.outputs = self.softmax(self.raw_logits)
        return self.outputs

    def backward(self, target_probs, learning_rate, reward_scaling=1.0):
        # Calculate output layer deltas (derivative of softmax + cross-entropy)
        output_deltas = []
        for i in range(self.output_size):
            delta = self.outputs[i] - target_probs[i]
            output_deltas.append(delta * self.abs(reward_scaling))
        # Calculate hidden layer deltas
        hidden_deltas = []
        for i in range(self.hidden_size):
            hidden_deltas.append(0.0)
            for j in range(self.output_size):
                hidden_deltas[i] += output_deltas[j] * self.weights_ho[i][j]
            hidden_deltas[i] *= self.relu_derivative(self.hidden[i])
        # Update weights and biases
        for i in range(self.hidden_size):
            for j in range(self.output_size):
                self.weights_ho[i][j] -= learning_rate * output_deltas[j] * self.hidden[i]
        for j in range(self.output_size):
            self.bias_o[j] -= learning_rate * output_deltas[j]
        for i in range(self.input_size):
            for j in range(self.hidden_size):
                self.weights_ih[i][j] -= learning_rate * hidden_deltas[j] * self.inputs[i]
        for j in range(self.hidden_size):
            self.bias_h[j] -= learning_rate * hidden_deltas[j]
        self.output_delta = output_deltas
        self.hidden_delta = hidden_deltas

    def learn_from_game(self, move_history, winner, even=True):
        nn_symbol = 'O' if even else 'X'
        if winner == 'T':
            reward = 0.3
        elif winner == nn_symbol:
            reward = 1.0
        else:
            reward = -2.0
        state = GameState()
        for idx, move in enumerate(move_history):
            # Skip if this isn't the NN's turn (even=True means the NN moved
            # second, i.e. on the odd indices of move_history).
            if (even and idx % 2 != 1) or (not even and idx % 2 != 0):
                continue
            # Recreate the board state as it was before this move was made.
            state.reset()
            for i in range(idx):
                state.make_move(move_history[i])
            self.forward(state.as_input())
            # Earlier moves get a smaller share of the reward.
            move_importance = 0.5 + 0.5 * float(idx / len(move_history))
            scaled_reward = reward * move_importance
            target_probs = [0.0] * self.output_size
            if scaled_reward >= 0:
                target_probs[move] = 1
            else:
                # For negative reward, distribute probability to OTHER
                # valid moves, which is conceptually the same as discouraging
                # the move that we want to discourage.
                num_valid_moves = state.board.count('.') - 1
                for i in range(self.output_size):
                    if state.board[i] == '.' and i != move:
                        target_probs[i] = 1 / num_valid_moves
            self.backward(target_probs, LEARNING_RATE, reward_scaling=scaled_reward)
            # print(f"Learning from move {move} with reward {scaled_reward}")

class GameState:
    def __init__(self, first_player=0):
        self.reset(first_player=first_player)

    def reset(self, first_player=0):
        self.board = ['.', '.', '.',
                      '.', '.', '.',
                      '.', '.', '.']
        self.current_player = first_player  # 0 for X (Human), 1 for O (NN)
        self.move_history = []

    def display(self):
        print("Current board:")
        for i in range(3):
            print(" ".join(self.board[i*3:(i+1)*3]))
        print(f"Current player: {'O' if self.current_player == 1 else 'X'}\n")

    def as_input(self):
        '''
        Board cell indices:
        0, 1, 2
        3, 4, 5
        6, 7, 8
        Each cell contributes two values to the input vector: (is_X, is_O).
        '''
        input_vector = []
        for i in range(9):
            if self.board[i] == '.':
                input_vector.extend([0, 0])
            elif self.board[i] == 'X':
                input_vector.extend([1, 0])
            else:  # 'O'
                input_vector.extend([0, 1])
        return input_vector

    def winner(self):
        # Check rows
        for i in range(3):
            if self.board[i*3] != '.' \
                    and self.board[i*3] == self.board[i*3+1] \
                    and self.board[i*3] == self.board[i*3+2]:
                return self.board[i*3]
        # Check columns
        for i in range(3):
            if self.board[i] != '.' \
                    and self.board[i] == self.board[i+3] \
                    and self.board[i] == self.board[i+6]:
                return self.board[i]
        # Check diagonals
        if self.board[0] != '.' \
                and self.board[0] == self.board[4] \
                and self.board[0] == self.board[8]:
            return self.board[0]
        if self.board[2] != '.' \
                and self.board[2] == self.board[4] \
                and self.board[2] == self.board[6]:
            return self.board[2]
        # Check for tie
        if '.' not in self.board:
            return 'T'  # Tie
        return None

    def make_move(self, move: int):
        current_symbol = 'O' if self.current_player == 1 else 'X'
        if self.board[move] == '.':
            self.board[move] = current_symbol
            self.current_player = self.current_player ^ 1
            self.move_history.append(move)
        else:
            raise ValueError("Invalid move")

    def get_computer_move(self, nn: NeuralNetwork, display_probs=False, move=True):
        probs = nn.forward(self.as_input())
        highest_prob = -1
        highest_prob_idx = -1
        best_move = None
        best_legal_prob = -1
        for i in range(9):
            if probs[i] > highest_prob:
                highest_prob = probs[i]
                highest_prob_idx = i
            if self.board[i] == '.':
                if best_move is None or probs[i] > best_legal_prob:
                    best_move = i
                    best_legal_prob = probs[i]
        # That's just for debugging. It's interesting to show to the user
        # in the first iterations of the game, since you can see how initially
        # the net picks illegal moves as best, and so forth.
        if display_probs:
            print("Probabilities of each move:")
            for i in range(3):
                for j in range(3):
                    pos = i * 3 + j
                    # Print probability as percentage
                    print(f"{probs[pos] * 100:.1f}", end="")
                    # Add markers.
                    if pos == highest_prob_idx:
                        print("*", end="")
                    elif pos == best_move:
                        print("#", end="")
                    print(" ", end="")
                print("\n")
            print(f"Sum of all probabilities: {sum(probs)}")
        if move:
            if self.board[best_move] == '.':
                self.make_move(best_move)
            else:
                raise ValueError("Invalid move")
        return best_move

    def get_random_move(self, move=True):
        if '.' not in self.board:
            raise ValueError("No legal moves left")
        while True:
            random_move = random.randint(0, 8)
            if self.board[random_move] == '.':
                if move:
                    self.make_move(random_move)
                return random_move
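
# A small sketch (not in the original gist) of what as_input() produces: each of
# the 9 cells contributes two values, (is_X, is_O), giving the 18-element vector
# the network expects. The two moves played here are arbitrary examples.
def _demo_board_encoding():
    state = GameState()
    state.make_move(0)   # X takes the top-left corner
    state.make_move(4)   # O takes the centre
    encoded = state.as_input()
    # Cell 0 -> [1, 0] (X), cell 4 -> [0, 1] (O), every other cell -> [0, 0].
    assert encoded[0:2] == [1, 0] and encoded[8:10] == [0, 1]
    return encoded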

def play_random_game(nn=None, display=False):
    game = GameState()
    while not game.winner():
        game.get_random_move()
        if display:
            game.display()
        winner = game.winner()
        if winner is not None:
            break
        if nn is not None:
            game.get_computer_move(nn, display_probs=display)
            if display:
                game.display()
            winner = game.winner()
            if winner is not None:
                break
    if display:
        if winner == 'X':
            print("X wins!")
        elif winner == 'O':
            print("O wins!")
        else:
            print("It's a tie!")
    return game, winner

def play_nn_vs_nn(nn_x, nn_o, display=False):
    game = GameState()
    while not game.winner():
        if game.current_player == 0:  # Player X's turn
            game.get_computer_move(nn_x, display_probs=display)
        else:  # Player O's turn
            game.get_computer_move(nn_o, display_probs=display)
        if display:
            game.display()
        winner = game.winner()
        if winner is not None:
            break
    if display:
        if winner == 'X':
            print("X wins!")
        elif winner == 'O':
            print("O wins!")
        else:
            print("It's a tie!")
        game.display()
    return game, winner
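
# A hedged evaluation sketch (not in the original gist): play a batch of games
# against the random mover and count the outcomes, so the effect of training can
# be measured. The function name and the default number of games are assumptions
# made for this example; the network still plays O and learns nothing here.
def evaluate_vs_random(trained_nn, num_games=1000):
    results = {'O': 0, 'X': 0, 'T': 0}
    for _ in range(num_games):
        _, winner = play_random_game(nn=trained_nn)
        results[winner] += 1
    return results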

nn = NeuralNetwork(NN_INPUT_SIZE, NN_HIDDEN_SIZE, NN_OUTPUT_SIZE)
with tqdm(total=150000) as t:
    wins, losses, ties = 0, 0, 0
    for i in range(150000):
        game, winner = play_random_game(nn=nn)
        t.update(1)
        if winner == 'O':
            wins += 1
        elif winner == 'X':
            losses += 1
        else:
            ties += 1
        games_ahead = wins - losses
        nn.learn_from_game(game.move_history, winner, even=True)
        t.set_postfix(r_wins=wins/(i+1), r_losses=losses/(i+1), r_ties=ties/(i+1), games_ahead=games_ahead)
        t.refresh()
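
# Optional check (not in the original gist): the helper sketched above could be
# used here to estimate how often the trained network beats random play. Left
# commented out so the script goes straight to the interactive game.
# print(evaluate_vs_random(nn, num_games=1000))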

game = GameState()
while True:
    game.display()
    move = input("Enter your move (0-8) or exit (q): ")
    if move == 'q':
        break
    game.make_move(int(move))
    if game.winner() is not None:
        break
    game.get_computer_move(nn, display_probs=True)
    if game.winner() is not None:
        break
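
# Small addition (not in the original gist): show the final board and outcome
# once the interactive loop ends. Assumes the human plays X and the network O;
# if the human quit with 'q', there is no winner and nothing extra is printed.
game.display()
final_winner = game.winner()
if final_winner == 'X':
    print("You win!")
elif final_winner == 'O':
    print("The network wins!")
elif final_winner == 'T':
    print("It's a tie!")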