Minimax Tic Tac Toe

#!/usr/bin/env python3
from math import inf as infinity
from random import choice
import platform
import time
from os import system

HUMAN = -1
COMP = +1
board = [
    [0, 0, 0],
    [0, 0, 0],
    [0, 0, 0],
]


def evaluate(state):
    """
    Heuristic evaluation of the current board state.
    :param state: the state of the current board
    :return: +1 if the computer wins; -1 if the human wins; 0 for a draw
    """
    if wins(state, COMP):
        score = +1
    elif wins(state, HUMAN):
        score = -1
    else:
        score = 0

    return score


def wins(state, player):
    """
    This function tests if a specific player wins. Possibilities:
    * Three rows    [X X X] or [O O O]
    * Three cols    [X X X] or [O O O]
    * Two diagonals [X X X] or [O O O]
    :param state: the state of the current board
    :param player: a human or a computer
    :return: True if the player wins
    """
    win_state = [
        [state[0][0], state[0][1], state[0][2]],
        [state[1][0], state[1][1], state[1][2]],
        [state[2][0], state[2][1], state[2][2]],
        [state[0][0], state[1][0], state[2][0]],
        [state[0][1], state[1][1], state[2][1]],
        [state[0][2], state[1][2], state[2][2]],
        [state[0][0], state[1][1], state[2][2]],
        [state[2][0], state[1][1], state[0][2]],
    ]
    return [player, player, player] in win_state


def game_over(state):
    """
    This function tests whether the human or the computer has won
    :param state: the state of the current board
    :return: True if the human or computer wins
    """
    return wins(state, HUMAN) or wins(state, COMP)


def empty_cells(state):
    """
    Each empty cell is added to the cells list
    :param state: the state of the current board
    :return: a list of empty cells
    """
    cells = []

    for x, row in enumerate(state):
        for y, cell in enumerate(row):
            if cell == 0:
                cells.append([x, y])

    return cells


def valid_move(x, y):
    """
    A move is valid if the chosen cell is empty
    :param x: X coordinate
    :param y: Y coordinate
    :return: True if the board[x][y] is empty
    """
    return [x, y] in empty_cells(board)


def set_move(x, y, player):
    """
    Set the move on board, if the coordinates are valid
    :param x: X coordinate
    :param y: Y coordinate
    :param player: the current player
    :return: True if the move was set, False otherwise
    """
    if valid_move(x, y):
        board[x][y] = player
        return True
    else:
        return False


def minimax(state, depth, player):
    """
    AI function that chooses the best move.
    :param state: current state of the board
    :param depth: node index in the tree (0 <= depth <= 9),
    but never nine in this case (see the ai_turn() function)
    :param player: a human or a computer
    :return: a list with [the best row, best col, best score]
    """
    if player == COMP:
        best = [-1, -1, -infinity]
    else:
        best = [-1, -1, +infinity]

    if depth == 0 or game_over(state):
        score = evaluate(state)
        return [-1, -1, score]

    for cell in empty_cells(state):
        x, y = cell[0], cell[1]
        state[x][y] = player
        score = minimax(state, depth - 1, -player)
        state[x][y] = 0
        score[0], score[1] = x, y

        if player == COMP:
            if score[2] > best[2]:
                best = score  # max value
        else:
            if score[2] < best[2]:
                best = score  # min value

    return best


def clean():
    """
    Clears the console
    """
    os_name = platform.system().lower()
    if 'windows' in os_name:
        system('cls')
    else:
        system('clear')


def render(state, c_choice, h_choice):
    """
    Print the board to the console.
    :param state: current state of the board
    :param c_choice: computer's choice X or O
    :param h_choice: human's choice X or O
    """

    chars = {
        -1: h_choice,
        +1: c_choice,
        0: ' '
    }
    str_line = '---------------'

    print('\n' + str_line)
    for row in state:
        for cell in row:
            symbol = chars[cell]
            print(f'| {symbol} |', end='')
        print('\n' + str_line)


def ai_turn(c_choice, h_choice):
    """
    Calls the minimax function if the depth < 9,
    otherwise it chooses a random coordinate.
    :param c_choice: computer's choice X or O
    :param h_choice: human's choice X or O
    :return:
    """
    depth = len(empty_cells(board))
    if depth == 0 or game_over(board):
        return

    clean()
    print(f'Computer turn [{c_choice}]')
    render(board, c_choice, h_choice)

    if depth == 9:
        x = choice([0, 1, 2])
        y = choice([0, 1, 2])
    else:
        move = minimax(board, depth, COMP)
        x, y = move[0], move[1]

    set_move(x, y, COMP)
    time.sleep(1)


def human_turn(c_choice, h_choice):
    """
    The human plays by choosing a valid move.
    :param c_choice: computer's choice X or O
    :param h_choice: human's choice X or O
    :return:
    """
    depth = len(empty_cells(board))
    if depth == 0 or game_over(board):
        return

    # Dictionary of valid moves
    move = -1
    moves = {
        1: [0, 0], 2: [0, 1], 3: [0, 2],
        4: [1, 0], 5: [1, 1], 6: [1, 2],
        7: [2, 0], 8: [2, 1], 9: [2, 2],
    }

    clean()
    print(f'Human turn [{h_choice}]')
    render(board, c_choice, h_choice)

    while move < 1 or move > 9:
        try:
            move = int(input('Use numpad (1..9): '))
            coord = moves[move]
            can_move = set_move(coord[0], coord[1], HUMAN)

            if not can_move:
                print('Bad move')
                move = -1
        except (EOFError, KeyboardInterrupt):
            print('Bye')
            exit()
        except (KeyError, ValueError):
            print('Bad choice')


def main():
    """
    Main function: handles setup and runs the game loop.
    """
    clean()
    h_choice = ''  # X or O
    c_choice = ''  # X or O
    first = ''  # 'Y' if the human plays first, 'N' otherwise

    # Human chooses X or O to play
    while h_choice != 'O' and h_choice != 'X':
        try:
            print('')
            h_choice = input('Choose X or O\nChosen: ').upper()
        except (EOFError, KeyboardInterrupt):
            print('Bye')
            exit()
        except (KeyError, ValueError):
            print('Bad choice')

    # Setting computer's choice
    if h_choice == 'X':
        c_choice = 'O'
    else:
        c_choice = 'X'

    # Human may start first
    clean()
    while first != 'Y' and first != 'N':
        try:
            first = input('First to start?[y/n]: ').upper()
        except (EOFError, KeyboardInterrupt):
            print('Bye')
            exit()
        except (KeyError, ValueError):
            print('Bad choice')

    # Main loop of this game
    while len(empty_cells(board)) > 0 and not game_over(board):
        if first == 'N':
            ai_turn(c_choice, h_choice)
            first = ''

        human_turn(c_choice, h_choice)
        ai_turn(c_choice, h_choice)

    # Game over message
    if wins(board, HUMAN):
        clean()
        print(f'Human turn [{h_choice}]')
        render(board, c_choice, h_choice)
        print('YOU WIN!')
    elif wins(board, COMP):
        clean()
        print(f'Computer turn [{c_choice}]')
        render(board, c_choice, h_choice)
        print('YOU LOSE!')
    else:
        clean()
        render(board, c_choice, h_choice)
        print('DRAW!')

    exit()

if __name__ == '__main__':
    main()
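
A quick sanity check of the search, as a minimal sketch: it assumes the listing above is saved as tic_tac_toe.py (the filename is an assumption), so its functions can be imported without launching the interactive game (main() only runs under the __main__ guard).

# Hypothetical test driver, not part of the listing above.
from tic_tac_toe import minimax, empty_cells, COMP, HUMAN  # assumed module name

test_board = [
    [COMP,  COMP,  0],
    [HUMAN, HUMAN, 0],
    [0,     0,     0],
]
# With the computer to move, the search should pick the winning cell (0, 2).
row, col, score = minimax(test_board, len(empty_cells(test_board)), COMP)
print(row, col, score)  # expected: 0 2 1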

Naive Learning

import itertools
import time

import numpy as np
import cv2

from moviepy.editor import VideoClip


WORLD_HEIGHT = 4
WORLD_WIDTH = 4
WALL_FRAC = .2
NUM_WINS = 5
NUM_LOSE = 10


class GridWorld:

    def __init__(self, world_height=3, world_width=4, discount_factor=.5, default_reward=-.5, wall_penalty=-.6,
                 win_reward=5., lose_reward=-10., viz=True, patch_side=120, grid_thickness=2, arrow_thickness=3,
                 wall_locs=[[1, 1], [1, 2]], win_locs=[[0, 3]], lose_locs=[[1, 3]], start_loc=[0, 0],
                 reset_prob=.2):
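        # The reward landscape is stored directly in self.world: every cell starts at
        # default_reward, then wall, lose and win cells are overwritten below.
        # Q-values are kept in a [height, width, num_actions] table updated by action().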
        self.world = np.ones([world_height, world_width]) * default_reward
        self.reset_prob = reset_prob
        self.world_height = world_height
        self.world_width = world_width
        self.wall_penalty = wall_penalty
        self.win_reward = win_reward
        self.lose_reward = lose_reward
        self.default_reward = default_reward
        self.discount_factor = discount_factor
        self.patch_side = patch_side
        self.grid_thickness = grid_thickness
        self.arrow_thickness = arrow_thickness
        self.wall_locs = np.array(wall_locs)
        self.win_locs = np.array(win_locs)
        self.lose_locs = np.array(lose_locs)
        self.at_terminal_state = False
        self.auto_reset = True
        self.random_respawn = True
        self.step = 0
        self.viz_canvas = None
        self.viz = viz
        self.path_color = (128, 128, 128)
        self.wall_color = (0, 255, 0)
        self.win_color = (0, 0, 255)
        self.lose_color = (255, 0, 0)
        self.world[self.wall_locs[:, 0], self.wall_locs[:, 1]] = self.wall_penalty
        self.world[self.lose_locs[:, 0], self.lose_locs[:, 1]] = self.lose_reward
        self.world[self.win_locs[:, 0], self.win_locs[:, 1]] = self.win_reward
        spawn_condn = lambda loc: self.world[loc[0], loc[1]] == self.default_reward
        self.spawn_locs = np.array([loc for loc in itertools.product(np.arange(self.world_height),
                                                                     np.arange(self.world_width))
                                    if spawn_condn(loc)])
        self.start_state = np.array(start_loc)
        self.bot_rc = None
        self.reset()
        self.actions = [self.up, self.left, self.right, self.down, self.noop]
        self.action_labels = ['UP', 'LEFT', 'RIGHT', 'DOWN', 'NOOP']
        self.q_values = np.ones([self.world.shape[0], self.world.shape[1], len(self.actions)]) * 1. / len(self.actions)
        if self.viz:
            self.init_grid_canvas()
            self.video_out_fpath = 'shm_dqn_gridsolver-' + str(time.time()) + '.mp4'
            self.clip = VideoClip(self.make_frame, duration=15)

    def make_frame(self, t):
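        # Per-frame callback for moviepy's VideoClip: take one learning step,
        # then return the canvas with the bot's current cell highlighted.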
        self.action()
        frame = self.highlight_loc(self.viz_canvas, self.bot_rc[0], self.bot_rc[1])
        return frame

    def check_terminal_state(self):
        if self.world[self.bot_rc[0], self.bot_rc[1]] == self.lose_reward \
                or self.world[self.bot_rc[0], self.bot_rc[1]] == self.win_reward:
            self.at_terminal_state = True
            # print('------++++---- TERMINAL STATE ------++++----')
            # if self.world[self.bot_rc[0], self.bot_rc[1]] == self.win_reward:
            #     print('GAME WON! :D')
            # elif self.world[self.bot_rc[0], self.bot_rc[1]] == self.lose_reward:
            #     print('GAME LOST! :(')
            if self.auto_reset:
                self.reset()

    def reset(self):
        # print('Resetting')
        if not self.random_respawn:
            self.bot_rc = self.start_state.copy()
        else:
            self.bot_rc = self.spawn_locs[np.random.choice(np.arange(len(self.spawn_locs)))].copy()
        self.at_terminal_state = False

    def up(self):
        action_idx = 0
        # print(self.action_labels[action_idx])
        new_r = self.bot_rc[0] - 1
        if new_r < 0 or self.world[new_r, self.bot_rc[1]] == self.wall_penalty:
            return self.wall_penalty, action_idx
        self.bot_rc[0] = new_r
        reward = self.world[self.bot_rc[0], self.bot_rc[1]]
        self.check_terminal_state()
        return reward, action_idx

    def left(self):
        action_idx = 1
        # print(self.action_labels[action_idx])
        new_c = self.bot_rc[1] - 1
        if new_c < 0 or self.world[self.bot_rc[0], new_c] == self.wall_penalty:
            return self.wall_penalty, action_idx
        self.bot_rc[1] = new_c
        reward = self.world[self.bot_rc[0], self.bot_rc[1]]
        self.check_terminal_state()
        return reward, action_idx

    def right(self):
        action_idx = 2
        # print(self.action_labels[action_idx])
        new_c = self.bot_rc[1] + 1
        if new_c >= self.world.shape[1] or self.world[self.bot_rc[0], new_c] == self.wall_penalty:
            return self.wall_penalty, action_idx
        self.bot_rc[1] = new_c
        reward = self.world[self.bot_rc[0], self.bot_rc[1]]
        self.check_terminal_state()
        return reward, action_idx

    def down(self):
        action_idx = 3
        # print(self.action_labels[action_idx])
        new_r = self.bot_rc[0] + 1
        if new_r >= self.world.shape[0] or self.world[new_r, self.bot_rc[1]] == self.wall_penalty:
            return self.wall_penalty, action_idx
        self.bot_rc[0] = new_r
        reward = self.world[self.bot_rc[0], self.bot_rc[1]]
        self.check_terminal_state()
        return reward, action_idx

    def noop(self):
        action_idx = 4
        # print(self.action_labels[action_idx])
        reward = self.world[self.bot_rc[0], self.bot_rc[1]]
        self.check_terminal_state()
        return reward, action_idx

    def qvals2probs(self, q_vals, epsilon=1e-4):
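        # Shift Q-values to be strictly positive and normalize them into a
        # probability distribution used for (exploratory) action selection.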
        action_probs = q_vals - q_vals.min() + epsilon
        action_probs = action_probs / action_probs.sum()
        return action_probs

    def action(self):
        # print('================ ACTION =================')
        if self.at_terminal_state:
            print('At terminal state, please call reset()')
            exit()
        # print('Start position:', self.bot_rc)
        start_bot_rc = self.bot_rc[0], self.bot_rc[1]
        q_vals = self.q_values[self.bot_rc[0], self.bot_rc[1]]
        action_probs = self.qvals2probs(q_vals)
        reward, action_idx = np.random.choice(self.actions, p=action_probs)()
        # print('End position:', self.bot_rc)
        # print('Reward:', reward)
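        # Q-learning-style update: alpha decays (very slowly) with the step count,
        # blending the old Q(s, a) with reward + discount * max_a' Q(s', a').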
        alpha = np.exp(-self.step / 10e9)
        self.step += 1
        qv = (1 - alpha) * q_vals[action_idx] + alpha * (reward + self.discount_factor
                                                         * self.q_values[self.bot_rc[0], self.bot_rc[1]].max())
        self.q_values[start_bot_rc[0], start_bot_rc[1], action_idx] = qv
        if self.viz:
            self.update_viz(start_bot_rc[0], start_bot_rc[1])
        if np.random.rand() < self.reset_prob:
            # print('-----> Randomly resetting to a random spawn point with probability', self.reset_prob)
            self.reset()

    def highlight_loc(self, viz_in, i, j):
        starty = i * (self.patch_side + self.grid_thickness)
        endy = starty + self.patch_side
        startx = j * (self.patch_side + self.grid_thickness)
        endx = startx + self.patch_side
        viz = viz_in.copy()
        cv2.rectangle(viz, (startx, starty), (endx, endy), (255, 255, 255), thickness=self.grid_thickness)
        return viz

    def update_viz(self, i, j):
        starty = i * (self.patch_side + self.grid_thickness)
        endy = starty + self.patch_side
        startx = j * (self.patch_side + self.grid_thickness)
        endx = startx + self.patch_side
        patch = np.zeros([self.patch_side, self.patch_side, 3]).astype(np.uint8)
        if self.world[i, j] == self.default_reward:
            patch[:, :, :] = self.path_color
        elif self.world[i, j] == self.wall_penalty:
            patch[:, :, :] = self.wall_color
        elif self.world[i, j] == self.win_reward:
            patch[:, :, :] = self.win_color
        elif self.world[i, j] == self.lose_reward:
            patch[:, :, :] = self.lose_color
        if self.world[i, j] == self.default_reward:
            action_probs = self.qvals2probs(self.q_values[i, j])
            x_component = action_probs[2] - action_probs[1]
            y_component = action_probs[0] - action_probs[3]
            magnitude = 1. - action_probs[-1]
            s = self.patch_side // 2
            x_patch = int(s * x_component)
            y_patch = int(s * y_component)
            arrow_canvas = np.zeros_like(patch)
            vx = s + x_patch
            vy = s - y_patch
            cv2.arrowedLine(arrow_canvas, (s, s), (vx, vy), (255, 255, 255), thickness=self.arrow_thickness,
                            tipLength=0.5)
            gridbox = (magnitude * arrow_canvas + (1 - magnitude) * patch).astype(np.uint8)
            self.viz_canvas[starty:endy, startx:endx] = gridbox
        else:
            self.viz_canvas[starty:endy, startx:endx] = patch

    def init_grid_canvas(self):
        org_h, org_w = self.world_height, self.world_width
        viz_w = (self.patch_side * org_w) + (self.grid_thickness * (org_w - 1))
        viz_h = (self.patch_side * org_h) + (self.grid_thickness * (org_h - 1))
        self.viz_canvas = np.zeros([viz_h, viz_w, 3]).astype(np.uint8)
        for i in range(org_h):
            for j in range(org_w):
                self.update_viz(i, j)

    def solve(self):
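        # Without visualization, learn indefinitely; otherwise render a fixed-duration
        # clip, taking one learning step per rendered frame via make_frame().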
        if not self.viz:
            while True:
                self.action()
        else:
            self.clip.write_videofile(self.video_out_fpath, fps=460)


def gen_world_config(h, w, wall_frac=.5, num_wins=2, num_lose=3):
    n = h * w
    num_wall_blocks = int(wall_frac * n)
    # Random integer cell coordinates; duplicate or overlapping locations are possible.
    wall_locs = (np.random.rand(num_wall_blocks, 2) * [h, w]).astype(int)
    win_locs = (np.random.rand(num_wins, 2) * [h, w]).astype(int)
    lose_locs = (np.random.rand(num_lose, 2) * [h, w]).astype(int)
    return wall_locs, win_locs, lose_locs


if __name__ == '__main__':
    wall_locs, win_locs, lose_locs = gen_world_config(WORLD_HEIGHT, WORLD_WIDTH, WALL_FRAC, NUM_WINS, NUM_LOSE)
    g = GridWorld(world_height=WORLD_HEIGHT, world_width=WORLD_WIDTH,
                  wall_locs=wall_locs, win_locs=win_locs, lose_locs=lose_locs, viz=True)
    g.solve()
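
The heart of the learner is the tabular update inside action(); below is a minimal standalone sketch of the same rule with made-up numbers, independent of the GridWorld class.

import numpy as np

# Standalone sketch of the update performed in GridWorld.action() (illustrative numbers only).
discount_factor = 0.5
step = 0
alpha = np.exp(-step / 10e9)   # effectively 1.0 early on; decays extremely slowly

q_old = 0.2        # current Q(s, a)
reward = -0.5      # reward observed after taking action a in state s
q_next_max = 0.8   # max over actions of Q at the resulting state s'

q_new = (1 - alpha) * q_old + alpha * (reward + discount_factor * q_next_max)
print(q_new)       # -0.1 when alpha == 1.0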