-
-
Save nandajavarma/a3a6b62f34e74ec4c31674934327bbd3 to your computer and use it in GitHub Desktop.
#!/usr/lib/python | |
# -*- coding: utf-8 -*- | |
# | |
# This is a re-implementation of Python's timsort in Python | |
# itself. This is purely for learning purposes. :) | |
# References: [ | |
# https://en.wikipedia.org/wiki/Timsort, | |
# http://wiki.c2.com/?TimSort | |
# ] | |
# | |
# Copyright 2017 Nandaja Varma <[email protected]> | |
# | |
# This program is free software; you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation; either version 3 of the License, or | |
# (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU Library General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with this program; if not, write to the Free Software | |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA | |
# 02110-1301, | |
# USA. | |
def binary_search(the_array, item, start, end): | |
if start == end: | |
if the_array[start] > item: | |
return start | |
else: | |
return start + 1 | |
if start > end: | |
return start | |
mid = (start + end)/ 2 | |
if the_array[mid] < item: | |
return binary_search(the_array, item, mid + 1, end) | |
elif the_array[mid] > item: | |
return binary_search(the_array, item, start, mid - 1) | |
else: | |
return mid | |
""" | |
Insertion sort that the heap sort uses if the array size is small or if | |
the size of the "run" is small | |
""" | |
def insertion_sort(the_array): | |
l = len(the_array) | |
for index in range(1, l): | |
value = the_array[index] | |
pos = binary_search(the_array, value, 0, index - 1) | |
the_array = the_array[:pos] + [value] + the_array[pos:index] + the_array[index+1:] | |
return the_array | |
def merge(left, right): | |
"""Takes two sorted lists and returns a single sorted list by comparing the | |
elements one at a time. | |
[1, 2, 3, 4, 5, 6] | |
""" | |
if not left: | |
return right | |
if not right: | |
return left | |
if left[0] < right[0]: | |
return [left[0]] + merge(left[1:], right) | |
return [right[0]] + merge(left, right[1:]) | |
def timsort(the_array): | |
runs, sorted_runs = [], [] | |
l = len(the_array) | |
new_run = [the_array[0]] | |
for i in range(1, l): | |
if i == l-1: | |
new_run.append(the_array[i]) | |
runs.append(new_run) | |
break | |
if the_array[i] < the_array[i-1]: | |
if not new_run: | |
runs.append([the_array[i-1]]) | |
new_run.append(the_array[i]) | |
else: | |
runs.append(new_run) | |
new_run = [] | |
else: | |
new_run.append(the_array[i]) | |
for each in runs: | |
sorted_runs.append(insertion_sort(each)) | |
sorted_array = [] | |
for run in sorted_runs: | |
sorted_array = merge(sorted_array, run) | |
print sorted_array |
I wrote up this Gist, which is the code as OP posted with PenguinLiam's suggestions
https://gist.github.com/bee-san/f6ccc000ab6527769999fd0a9ebf59de
@PenguinLiam Actually to make the code work on both python2 and python3, don't use round
method, using //
instead:
mid = (start + end) // 2
This is not Timsort! Timsort uses a stack with specific constraints on the top entries to decide which runs to merge and when.
@6r1d: I haven't worked through that code entirely, but it does not find existing runs or reverse them before performing insertion sort. I believe this is required for Timsort.
I think that line 90 should be new_run = [the_array[i]]
, this will not remove any values when divide the array into runs I guess.
This code work on both python2 and python3. Please use mid = int((start + end)/ 2)
or mid = (start + end) // 2
instead of mid = (start + end)/ 2
Using your algorithm I found a few flaws. First off, when identifying which is the mid value in binary_search, a round function should be used as mid can be a float otherwise and lists don't use floats for indexes. Second which is little confusing, it removed on average about 20% of the values from the list when sorting it. I found the problem to be in the section placing items into arrays. The solution I found was to remove this line:
runs.append([the_array[i-1]])
because it added duplicate items to the runs array, and add this line:
runs.append([the_array[i]])
to add an item to the runs array if new_run isnt empty