filipsalo · December 5, 2008 19:23
diff --git a/build_carddata.php b/build_carddata.php
 <?php
 /**
 * Order two strings so that longer ones come first; strings of equal
 * length are ordered by strcmp().
 *
 * Meant to be used as a comparison callback (e.g. for uksort()).
 *
 * @param string $a first string
 * @param string $b second string
 * @return int negative if $a sorts before $b, positive if it sorts after,
 *             zero if the two strings are identical
 */
 function strlen_cmp($a, $b) {
  $len_a = strlen($a);
  $len_b = strlen($b);
  if ($len_a != $len_b) {
   /* Descending by length: the longer string gives a negative result */
   return $len_b - $len_a;
  }
  return strcmp($a, $b);
 }

 /**
 * Escape a string so that it can be embedded literally in a regular
 * expression that uses "/" as its delimiter.
 *
 * @param string $str text to escape
 * @return string $str with regex metacharacters and "/" backslash-escaped
 */
 function quote_for_regex($str) {
  $delimiter = "/";
  $quoted = preg_quote($str, $delimiter);
  return $quoted;
 }

 /* Read the card data and build an array of (name => id) pairs.
    Every line of kortnamn.txt is expected to be "<id> <card name>". */
 $lines = file("kortnamn.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
 if ($lines === false) {
  /* Without the card list there is nothing sensible to generate */
  error_log("build_carddata: could not read kortnamn.txt");
  exit(1);
 }
 $cards = array();
 foreach ($lines as $line) {
  $fields = explode(" ", $line, 2);
  if (count($fields) < 2 || $fields[1] === "") {
   /* No name after the id. An empty name would become an empty regex
      alternative, which matches at every position, so skip the line. */
   continue;
  }
  list($id, $card) = $fields;
  $cards[$card] = (int) $id;
 }

 /* Sort the array by length, longest names first; the alternation below
    keeps this order, so a longer name is tried before a shorter one */
 uksort($cards, "strlen_cmp");

 /* Build the regular expressions: "|"-separated alternations of the quoted
    names. A new expression is started whenever adding the next name would
    push the current one past $RE_LIMIT bytes. Splitting between whole
    names (instead of cutting a finished string at some "|") means a quoted
    "|" inside a card name can never be mistaken for a separator. */
 $RE_LIMIT = 30000;
 $card_regexes = array();
 $chunk = "";
 foreach (array_keys($cards) as $name) {
  /* Numeric-looking names come back from array_keys() as integers */
  $quoted = quote_for_regex((string) $name);
  if ($chunk === "") {
   $chunk = $quoted;
  } elseif (strlen($chunk) + 1 + strlen($quoted) > $RE_LIMIT) {
   $card_regexes[] = $chunk;
   $chunk = $quoted;
  } else {
   $chunk .= "|" . $quoted;
  }
 }
 if ($chunk !== "") {
  $card_regexes[] = $chunk;
 }

 /* Finish up the regexes with delimiters and the S option.
    (foreach instead of each(): each() no longer exists in PHP 8.) */
 foreach ($card_regexes as $key => $regex) {
  $card_regexes[$key] = sprintf("/%s/S", $regex);
 }


 /* Output the generated php file */
 echo "<?php\n";
 printf("\$card_regexes = %s;\n", var_export($card_regexes, true));
 echo "?>\n";

 ?>
diff --git a/kortnamn.txt b/kortnamn.txt
 15094 "Ach! Hans, Run!"
 17880 1996 World Champion
 5629 Abandon Hope
 8127 Abandoned Outpost
 1421 Abbey Gargoyles
 ...
 3465 Zuran Enchanter
 3241 Zuran Orb
 3466 Zuran Spellcaster
 15048 Zzzyxas's Abyss
 15001 _____
diff --git a/linkify_cards.php b/linkify_cards.php
 <?php

 /* Note the start time (seconds, including the fractional part) so the
    total run time can be reported at the end */
 list($start_usec, $start_sec) = explode(' ', microtime());
 $starttime = $start_usec + $start_sec;

 /**
 * Check that a regex match doesn't overlap with any of the spans in the
 * global $ignore list. (We use this to filter matches in step 2.)
 *
 * The candidate and the ignored spans are (string, offset) pairs; each
 * one covers the half-open range [offset, offset + strlen(string)).
 *
 * @param array $span candidate match: array(matched text, start offset)
 * @return bool true if the candidate is clear of every ignored span,
 *              false as soon as one overlap is found
 */
 function no_overlaps($span) {
  global $ignore;
  list($s, $start) = $span;
  $end = $start + strlen($s);
  foreach ($ignore as $ispan) {
   list($is, $a) = $ispan;
   $b = $a + strlen($is);
   $starts_inside = $a <= $start && $start < $b;
   $ends_inside = $a < $end && $end <= $b;
   /* The candidate may also swallow an ignored span whole, in which case
      neither of its end points lies inside that span */
   $encloses = $start <= $a && $b <= $end;
   if ($starts_inside || $ends_inside || $encloses) {
    return false;
   }
  }
  return true;
 }


 /* STEP 0: get the generated card and regex definitions
    (carddata.php is expected to define $card_regexes) */
 require 'carddata.php';


 /* STEP 1: Get the input data and identify any "untouchable" substrings:
    a complete [tag ...]...[/tag] pair, or failing that any single [...] */
 $data = implode(file('spel_mini.txt'));
 preg_match_all('/\[([^]\s]+)(\s+.*?)?\][^[]+\[\/\1\]|\[[^]]+\]/S',
                $data, $ignore, PREG_OFFSET_CAPTURE);
 $ignore = $ignore[0];


 /* STEP 2: Look for card names in the data */
 $matches = array();
 foreach ($card_regexes as $regex) {
  preg_match_all($regex, $data, $candidates, PREG_OFFSET_CAPTURE);

  /* Exclude any matches that overlap with untouchables */
  $candidates = array_filter($candidates[0], 'no_overlaps');

  /* Collect the remaining matches */
  $matches = array_merge($matches, $candidates);

  /* Add this round's matches to the ignore list, so we can exclude any
     overlapping matches from the remaining regexes. Only the new ones
     are added: merging all of $matches every time would put the earlier
     matches on the list again and again. */
  $ignore = array_merge($ignore, $candidates);
 }


 /* STEP 3: Go through and actually do the replacement on the matches
   starting from the end, so we don't mess up the indices as we go
   along */

 /* Comparison callback for sorting (string, offset) matches by their
    position, backwards (highest offset first) */
 function position_cmp($a, $b) {
  list(, $pos_a) = $a;
  list(, $pos_b) = $b;
  return $pos_b - $pos_a;
 }
 /* Put the matches in back-to-front order */
 usort($matches, "position_cmp");

 /* Wrap every matched card name in [card]...[/card]. */
 foreach ($matches as $hit) {
  $name = $hit[0];
  $offset = $hit[1];
  $linked = sprintf('[card]%s[/card]', $name);
  $data = substr_replace($data, $linked, $offset, strlen($name));
 }

 /* DONE: Output the result */
 echo $data;

 /* Report how long the whole run took. */
 list($end_usec, $end_sec) = explode(' ', microtime());
 $totaltime = $end_usec + $end_sec - $starttime;
 printf("Gone in %.3f seconds.\n\n", $totaltime);
 ?>
	<?php
	/**
	 * Order two strings so that longer ones come first; strings of equal
	 * length are ordered by strcmp().
	 *
	 * Meant to be used as a comparison callback (e.g. for uksort()).
	 *
	 * @param string $a first string
	 * @param string $b second string
	 * @return int negative if $a sorts before $b, positive if it sorts after,
	 *             zero if the two strings are identical
	 */
	function strlen_cmp($a, $b) {
	    $len_a = strlen($a);
	    $len_b = strlen($b);
	    if ($len_a != $len_b) {
	        /* Descending by length: the longer string gives a negative result */
	        return $len_b - $len_a;
	    }
	    return strcmp($a, $b);
	}

	/**
	 * Escape a string so that it can be embedded literally in a regular
	 * expression that uses "/" as its delimiter.
	 *
	 * @param string $str text to escape
	 * @return string $str with regex metacharacters and "/" backslash-escaped
	 */
	function quote_for_regex($str) {
	    $delimiter = "/";
	    $quoted = preg_quote($str, $delimiter);
	    return $quoted;
	}

	/* Read the card data and build an array of (name => id) pairs.
	   Every line of kortnamn.txt is expected to be "<id> <card name>". */
	$lines = file("kortnamn.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
	if ($lines === false) {
	    /* Without the card list there is nothing sensible to generate */
	    error_log("build_carddata: could not read kortnamn.txt");
	    exit(1);
	}
	$cards = array();
	foreach ($lines as $line) {
	    $fields = explode(" ", $line, 2);
	    if (count($fields) < 2 || $fields[1] === "") {
	        /* No name after the id. An empty name would become an empty regex
	           alternative, which matches at every position, so skip the line. */
	        continue;
	    }
	    list($id, $card) = $fields;
	    $cards[$card] = (int) $id;
	}

	/* Sort the array by length, longest names first; the alternation below
	   keeps this order, so a longer name is tried before a shorter one */
	uksort($cards, "strlen_cmp");

	/* Build the regular expressions: "|"-separated alternations of the quoted
	   names. A new expression is started whenever adding the next name would
	   push the current one past $RE_LIMIT bytes. Splitting between whole
	   names (instead of cutting a finished string at some "|") means a quoted
	   "|" inside a card name can never be mistaken for a separator. */
	$RE_LIMIT = 30000;
	$card_regexes = array();
	$chunk = "";
	foreach (array_keys($cards) as $name) {
	    /* Numeric-looking names come back from array_keys() as integers */
	    $quoted = quote_for_regex((string) $name);
	    if ($chunk === "") {
	        $chunk = $quoted;
	    } elseif (strlen($chunk) + 1 + strlen($quoted) > $RE_LIMIT) {
	        $card_regexes[] = $chunk;
	        $chunk = $quoted;
	    } else {
	        $chunk .= "|" . $quoted;
	    }
	}
	if ($chunk !== "") {
	    $card_regexes[] = $chunk;
	}

	/* Finish up the regexes with delimiters and the S option.
	   (foreach instead of each(): each() no longer exists in PHP 8.) */
	foreach ($card_regexes as $key => $regex) {
	    $card_regexes[$key] = sprintf("/%s/S", $regex);
	}


	/* Output the generated php file */
	echo "<?php\n";
	printf("\$card_regexes = %s;\n", var_export($card_regexes, true));
	echo "?>\n";

	?>
	15094 "Ach! Hans, Run!"
	17880 1996 World Champion
	5629 Abandon Hope
	8127 Abandoned Outpost
	1421 Abbey Gargoyles
	...
	3465 Zuran Enchanter
	3241 Zuran Orb
	3466 Zuran Spellcaster
	15048 Zzzyxas's Abyss
	15001 _____
	<?php

	/* Note the start time (seconds, including the fractional part) so the
	   total run time can be reported at the end */
	list($start_usec, $start_sec) = explode(' ', microtime());
	$starttime = $start_usec + $start_sec;

	/**
	 * Check that a regex match doesn't overlap with any of the spans in the
	 * global $ignore list. (We use this to filter matches in step 2.)
	 *
	 * The candidate and the ignored spans are (string, offset) pairs; each
	 * one covers the half-open range [offset, offset + strlen(string)).
	 *
	 * @param array $span candidate match: array(matched text, start offset)
	 * @return bool true if the candidate is clear of every ignored span,
	 *              false as soon as one overlap is found
	 */
	function no_overlaps($span) {
	    global $ignore;
	    list($s, $start) = $span;
	    $end = $start + strlen($s);
	    foreach ($ignore as $ispan) {
	        list($is, $a) = $ispan;
	        $b = $a + strlen($is);
	        $starts_inside = $a <= $start && $start < $b;
	        $ends_inside = $a < $end && $end <= $b;
	        /* The candidate may also swallow an ignored span whole, in which
	           case neither of its end points lies inside that span */
	        $encloses = $start <= $a && $b <= $end;
	        if ($starts_inside || $ends_inside || $encloses) {
	            return false;
	        }
	    }
	    return true;
	}


	/* STEP 0: get the generated card and regex definitions
	   (carddata.php is expected to define $card_regexes) */
	require 'carddata.php';


	/* STEP 1: Get the input data and identify any "untouchable" substrings:
	   a complete [tag ...]...[/tag] pair, or failing that any single [...] */
	$data = implode(file('spel_mini.txt'));
	preg_match_all('/\[([^]\s]+)(\s+.*?)?\][^[]+\[\/\1\]|\[[^]]+\]/S',
	               $data, $ignore, PREG_OFFSET_CAPTURE);
	$ignore = $ignore[0];


	/* STEP 2: Look for card names in the data */
	$matches = array();
	foreach ($card_regexes as $regex) {
	    preg_match_all($regex, $data, $candidates, PREG_OFFSET_CAPTURE);

	    /* Exclude any matches that overlap with untouchables */
	    $candidates = array_filter($candidates[0], 'no_overlaps');

	    /* Collect the remaining matches */
	    $matches = array_merge($matches, $candidates);

	    /* Add this round's matches to the ignore list, so we can exclude any
	       overlapping matches from the remaining regexes. Only the new ones
	       are added: merging all of $matches every time would put the
	       earlier matches on the list again and again. */
	    $ignore = array_merge($ignore, $candidates);
	}


	/* STEP 3: Go through and actually do the replacement on the matches
	starting from the end, so we don't mess up the indices as we go
	along */

	/* Comparison callback for sorting (string, offset) matches by their
	   position, backwards (highest offset first) */
	function position_cmp($a, $b) {
	    list(, $pos_a) = $a;
	    list(, $pos_b) = $b;
	    return $pos_b - $pos_a;
	}
	/* Put the matches in back-to-front order */
	usort($matches, "position_cmp");

	/* Wrap every matched card name in [card]...[/card]. */
	foreach ($matches as $hit) {
	    $name = $hit[0];
	    $offset = $hit[1];
	    $linked = sprintf('[card]%s[/card]', $name);
	    $data = substr_replace($data, $linked, $offset, strlen($name));
	}

	/* DONE: Output the result */
	echo $data;

	/* Report how long the whole run took. */
	list($end_usec, $end_sec) = explode(' ', microtime());
	$totaltime = $end_usec + $end_sec - $starttime;
	printf("Gone in %.3f seconds.\n\n", $totaltime);
	?>