An optimization example

This page was inspired by a question posed in comp.lang.c. What is the best way to find a 2 character pattern in an unsigned character array of known length? The array can contain the character '\0', so its not a string.

The obvious solution is to just walk the array looking for the two characters as shown below:


	int
	try_1(
		unsigned	len,
		unsigned char 	str[],
		unsigned char	pat1,
		unsigned char	pat2
		)
	{
	  unsigned	i;
	
	  if( len <= 1 ) {
	    return 0;
	  }
	
	  for( i = 0; i < len - 1; i++ ) {
	    if( str[i] == pat1 && str[i+1] == pat2 ) {
	      return 1;
	    }
	  }
	  return 0;
	}

The code above loads characters twice; once as str[i+1] and then as str[i]. An obvious optimization is to avoid loading the same character twice.


	int
	try_2(
		unsigned	len,
		unsigned char 	str[],
		unsigned char	pat1,
		unsigned char	pat2
		)
	{
	  int	i;
	  int	c;
	
	  if( len <= 1 ) {
	    return 0;
	  }
	
	  i = 0;
	  while( 1 ) {
	    /* can't generate a match after first */
	    if( i == len - 1 ) {
	      return 0;
	    }
	    c = str[i];
	    i++;
	match_first:;
	    if( c == pat1 ) {
	      c = str[i];
	      i++;
	      if( c == pat2 ) {
	        return 1;
	      }
	      if( i == len ) {
	        return 0;
	      }
	      goto match_first;
	    }
	  }
	}

The code above has two stopping conditions:

The two stopping conditions can be reduced to one by adding the pattern at the end of the existing array. This way the pattern is guaranteed to be found, so the end of array test is not needed. However, after the pattern is found, we have to test whether the pattern was the one we added.


	int
	try_3(
		unsigned	len,
		unsigned char 	str[],
		unsigned char	pat1,
		unsigned char	pat2
		)
	{
	  int	i;
	  int	c;
	
	  str[len] = pat1;
	  str[len+1] = pat2;
	
	  i = 0;
	  while( 1 ) {
	    c = str[i];
	    i++;
	match_first:;
	    if( c == pat1 ) {
	      c = str[i];
	      i++;
	      if( c == pat2 ) {
	        goto found_match;
	      }
	      goto match_first;
	    }
	  }
	
	found_match:;
	  return i <= len;
	}

One problem with this code is that it is not always possible to write beyond the end of the array. In that case, we can still use the same idea, by writing the pattern over the last two characters of the existing array. In that case we have to do extra work to save and restore those last two characters.


	int
	try_4(
		unsigned	len,
		unsigned char 	str[],
		unsigned char	pat1,
		unsigned char	pat2
		)
	{
	  int	i;
	  int	c;
	  int	old_1;
	  int	old_2;
	
	  if( len <= 1 ) {
	    return 0;
	  }
	  old_1 = str[len-2];
	  old_2 = str[len-1];
	  if( old_1 == pat1 && old_2 == pat2 ) {
	    return 1;
	  }
	  if( len == 2 ) {
	    return 0;
	  }
	  if( old_1 == pat2 && str[len-3] == pat1 ) {
	    return 1;
	  }
	  if( len == 3 ) {
	    return 0;
	  }
	
	  str[len-2] = pat1;
	  str[len-1] = pat2;
	
	  i = 0;
	  while( 1 ) {
	    c = str[i];
	    i++;
	match_first:;
	    if( c == pat1 ) {
	      c = str[i];
	      i++;
	      if( c == pat2 ) {
	        goto found_match;
	      }
	      goto match_first;
	    }
	  }
	
	found_match:;
	  str[len-2] = old_1;
	  str[len-1] = old_2;
	
	  return i <= len-1;
	}

The codes above operates read one character at a time from memory. However, most processors efficiently load multiple bytes from memory, and operate on multiple bytes at a time. We can exploit this ability to produce code, that on many high-end modern processors, should run faster than the character-at-a-time codes shown above.

Before we get into the following code, a warning: this code is non-portable for at least 2 reasons.


	int
	try_5(
		unsigned	len,
		unsigned char	str[],
		unsigned char	pat1,
		unsigned char	pat2
		)
	{
	  unsigned	len_4 = (len-1)>>2;
	  unsigned	mask1 = (pat1<<24)|(pat1<<16)|(pat1<<8)|(pat1);
	  unsigned	mask2 = (pat2<<24)|(pat2<<16)|(pat2<<8)|(pat2);
	  unsigned	ones = 0x7f7f7f7f;
	  unsigned	compl = ~ones;
	  unsigned *	p1 = (unsigned *)str;
	  unsigned *	p2 = (unsigned *)(str+1);
	  unsigned	w1;
	  unsigned	w2;
	  unsigned	v1;
	  unsigned	v2;
	  unsigned	i;
	
	  if( len <= 1 ) {
	    return 0;
	  }
	
	  for( i = 0; i < len_4; i++ ) {
	    w1 = p1[i];
	    w1 ^= mask1;
	    v1 = w1&ones;
	    v1 += ones;
	    v1 |= w1;
	    v1 &= compl;
	    if( v1 != compl ) {
	
	      w2 = p2[i];
	      w2 ^= mask2;
	      v2 = w2&ones;
	      v2 += ones;
	      v2 |= w2;
	      v1 |= v2;
	      v1 &= compl;
	      if( v1 != compl ) {
		return 1;
	      }
	    }
	  }
	
	
	  for( i = len_4*4; i < len-1; i++ ) {
	    if( str[i] == pat1 && str[i+1] == pat2 ) {
	      return 1;
	    }
	  }
	  return 0;
	}

To understand how this code works, consider the following problem: given an octet, without using compares, return 0x00 if the given octet is 0 otherwise return 0x80.

We have glossed over one issue - what happens if we add 0x7f to an octet between 0x81 and 0xff? In that case the sum overflows, producing a number of greater than or equal to 0x100. This is not an issue if we are only dealing with one octet. The high-order bit is 1 because of the or-ing with the original octet, not because of the sum.

Now, we can take this idea and extend it to detecting whether there is a 0 octet in a word. However we now have to worry about overflow; the overflow from one octet can affect our results. We suppress this by first masking each octet in the group with 0x7f. The following code checks for the presence of a 0 octet in a 4 octet quantity.


	mask = var & 0x7f7f7f7f;	/* suppress overflow */
	sum  = mask + 0x7f7f7f7f;	/* set high order bit for +ve octets */
	or   = sum | var;		/* set high order bit for -ve octets */
	and  = or & 0x8080808;		/* remove all but high-order bits */
	is_0 = and != 0x80808080;	/* if one octet was 0, then value will
					 * be something like 0x80.00.80.80
					 */

To search for a pattern instead of 0s, we xor each octet with the octet being searched for; if they are equal the result will be 0. For multiple octets, xor with the concatenation of the sequence.

To adapt the multi-byte approach for use on processors that do not efficiently support unaligned loads, the following modifications can be made:

All of the approaches above examine every octet. However, if the unsigned characters in the pattern are different then it may be more efficient to examine every other character.


	int
	try_6(
		unsigned	len,
		unsigned char 	str[],
		unsigned char	pat1,
		unsigned char	pat2
		)
	{
	  unsigned	i;
	  int		hit;
	
	  if( len <= 1 ) {
	    return 0;
	  }
	  if( str[0] == pat1 && str[1] == pat2 ) {
	    return 1;
	  }
	  if( str[len-2] == pat1 && str[len-1] == pat2 ) {
	    return 1;
	  }
	
	  for( i = 1; i <<len-1; i+=2 ) {
	    if( str[i] == pat1 )
	      if( str[i+1] == pat2 ) {
	        return 1;
	      }
	    }
	    else if( str[i] == pat2 ) {
	      if( str[i-1] == pat1 ) {
		return 1;
	      }
	    }
	  }
	  return 0;
	}

If the pattern is constant (or rarely changes), we can replace the two comparisons in the loop with a table lookup.


	void
	setup_try_7(
		unsigned char	pat1,
		unsigned char	pat2,
		unsigned char	tbl[]
		)
	{
	  for( i = 0; i <<(1<<CHAR_BIT); i++ ) {
	    tbl[i] = 0;
	  }
	  tbl[pat1] = 1;
	  tbl[pat2] = 2;
	}

	int
	try_7(
		unsigned	len,
		unsigned char 	str[],
		unsigned char	pat1,
		unsigned char	pat2,
		unsigned char	tbl[]
		)
	{
	  unsigned	i;
	  int		hit;
	
	  if( len <= 1 ) {
	    return 0;
	  }
	  if( str[0] == pat1 && str[1] == pat2 ) {
	    return 1;
	  }
	  if( str[len-2] == pat1 && str[len-1] == pat2 ) {
	    return 1;
	  }
	
	
	  for( i = 1; i <<len-1; i+=2 ) {
	    hit = tbl[str[i]];
	    if( hit ) {
	      if( hit == 1 ) {
	        if( str[i+1] == pat2 ) {
		  return 1;
		}
	      }
	      else {
	        if( str[i-1] == pat2 ) {
		  return 1;
		}
	      }
	    }
	  }
	  return 0;
	}


Next Prev Main Top Feedback