Spiga

Cut HTML string without breaking the tags

by Gabi Solomon

On a recent project I had to take HTML source code and break it into several pieces, all without breaking the HTML tags. After struggling for a while I started googling for a solution from the most inspired developers :D . After some time spent trying different pieces of code I came upon the code below. It is a method from CakePHP and it did the job perfectly.

I hope you're lucky enough to find this article and save yourself some time.
Cheers

PHP:
  /**
   * Truncates text, optionally keeping HTML markup intact.
   *
   * Cuts $text so that its visible length plus the length of $ending does not
   * exceed $length, then appends $ending. If the text already fits, it is
   * returned unchanged and no ending is appended.
   *
   * In HTML mode tags are copied to the output without being counted, every
   * HTML entity counts as a single character, and any element still open at
   * the cut position is closed after the ending.
   *
   * Limitation: lengths are measured in bytes (strlen/substr), so a multi-byte
   * UTF-8 character counts as several characters and may be split by the cut.
   *
   * @param string  $text String to truncate.
   * @param integer $length Length of returned string, including ellipsis.
   * @param string  $ending Ending to be appended to the trimmed string.
   * @param boolean $exact If false, $text will not be cut mid-word: the result is
   *                       cut back to the last space. When there is no space to
   *                       cut at, the exact cut is kept.
   * @param boolean $considerHtml If true, HTML tags would be handled correctly
   * @return string Trimmed string.
   */
  function truncate($text, $length = 100, $ending = '...', $exact = true, $considerHtml = false) {
      // matches one HTML entity (named, decimal or hexadecimal)
      $entity_pattern = '/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/i';

      if ($considerHtml) {
          // if the plain text is shorter than the maximum length, return the whole text
          if (strlen(preg_replace('/<.*?>/', '', $text)) <= $length) {
              return $text;
          }

          // split the input into (optional tag, following text) pairs
          preg_match_all('/(<.+?>)?([^<>]*)/s', $text, $lines, PREG_SET_ORDER);

          // the ending is part of the length budget
          $total_length = strlen($ending);
          $truncate = '';

          foreach ($lines as $line_matchings) {
              // tags are copied to the output but never counted
              if (!empty($line_matchings[1])) {
                  $truncate .= $line_matchings[1];
              }

              // length of the plain text part; each entity counts as one character
              $content_length = strlen(preg_replace($entity_pattern, ' ', $line_matchings[2]));
              if ($total_length + $content_length > $length) {
                  // the number of visible characters which are left
                  $left = $length - $total_length;
                  $entities_length = 0;
                  if (preg_match_all($entity_pattern, $line_matchings[2], $entities, PREG_OFFSET_CAPTURE)) {
                      // every entity that starts inside the remaining budget uses up one
                      // visible character but strlen($entity) bytes of the source
                      foreach ($entities[0] as $entity) {
                          if ($entity[1] + 1 - $entities_length <= $left) {
                              $left--;
                              $entities_length += strlen($entity[0]);
                          } else {
                              // no more characters left
                              break;
                          }
                      }
                  }
                  $truncate .= substr($line_matchings[2], 0, $left + $entities_length);
                  // maximum length is reached, so get off the loop
                  break;
              }

              $truncate .= $line_matchings[2];
              $total_length += $content_length;

              // if the maximum length is reached, get off the loop
              if ($total_length >= $length) {
                  break;
              }
          }
      } else {
          if (strlen($text) <= $length) {
              return $text;
          }
          $truncate = substr($text, 0, $length - strlen($ending));
      }

      // if the words shouldn't be cut in the middle...
      if (!$exact) {
          // ...search the last occurrence of a space...
          $spacepos = strrpos($truncate, ' ');
          if ($considerHtml) {
              // ...skipping spaces that sit inside a tag (e.g. between attributes),
              // because cutting there would leave a broken tag in the output
              while ($spacepos !== false) {
                  $head = substr($truncate, 0, $spacepos);
                  $tag_start = strrpos($head, '<');
                  if ($tag_start === false || strpos($head, '>', $tag_start) !== false) {
                      // the last tag before the space is complete: the space is in text
                      break;
                  }
                  $spacepos = strrpos(substr($truncate, 0, $tag_start), ' ');
              }
          }
          // strrpos() returns false (not null) when nothing was found, so isset()
          // cannot be used here; without a usable space the exact cut is kept
          if ($spacepos !== false) {
              // ...and cut the text in this position
              $truncate = substr($truncate, 0, $spacepos);
          }
      }

      // Work out which elements are still open. This is done on the final
      // truncated markup (after the word-boundary cut) so that tags removed by
      // that cut are neither closed twice nor left unclosed.
      $open_tags = array();
      if ($considerHtml) {
          preg_match_all('/<.+?>/s', $truncate, $tag_list);
          foreach ($tag_list[0] as $tag) {
              // "empty elements" with or without xhtml-conform closing slash (f.e. <br/>) never need closing
              if (preg_match('/^<(\s*.+?\/\s*|\s*(img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param)(\s.+?)?)>$/is', $tag)) {
                  continue;
              }
              if (preg_match('/^<\s*\/([^\s]+?)\s*>$/s', $tag, $tag_matchings)) {
                  // closing tag (f.e. </b>): drop the most recently opened element of that
                  // name; names are compared lower-cased, matching how they are stored
                  $pos = array_search(strtolower($tag_matchings[1]), $open_tags, true);
                  if ($pos !== false) {
                      unset($open_tags[$pos]);
                  }
              } else if (preg_match('/^<\s*([^\s>!]+).*?>$/s', $tag, $tag_matchings)) {
                  // opening tag (f.e. <b>): newest first, so closing happens innermost-first
                  array_unshift($open_tags, strtolower($tag_matchings[1]));
              }
          }
      }

      // add the defined ending to the text
      $truncate .= $ending;

      // close all unclosed html-tags
      foreach ($open_tags as $tag) {
          $truncate .= '</' . $tag . '>';
      }

      return $truncate;
  }

Related Posts

  • Hey @ Author,you might want to consider modifying it because if you included HTML tags within the truncated part it'l break.
  • Worked for me, thank you very much!
  • I googled and found exactly what I was looking for in this blog post. Thank you, saved me a lot of time! :)
  • KDV
    this peice code is awesome...it really saved me a days work
  • Hi,
    I've modified your version a little bit, because it was breaking tags if $exact=false and it hits space in the middle of the tag. Really simple, just adding another variable, $doingtag.
    You can check it our here http://www.securityhacking.tk/2010/02/cut-html-string-without-breaking-the-tags/
  • yogix
    Nice one! Thank you. I´ve been searching for this few months already!
  • This is brilliant cheers! Saved me hours!
  • too old version, on line 96 cutting html text '
    Google was here.
  • Thanks!
    Your work saved me hours of work.
  • Roly
    Thanks I was looking for something that does exactly this.
  • what if set '$exact=false' with "...the last space is before tag nospace</html>" ? the tag '' will be lost, isn't it?
  • @kami
    If the $exact parameter is true, the text will have an exact length and words may be cut in the middle; if it is false, the text is cut back to the last space, so it may come out a few characters shorter because words are not cut.
  • Snypy
    Thumbs up! ;-)

    Thanks, really helped me. Since I used cuttext method, I was wondering how to cut html code without loosing tags. Now, you solved my problem :-)
  • @phong-tt
    you welcome
  • phong-tt
    thanks for your code man, i'm keep searching for a while until now :)
  • @lecone you welcome
  • lecone
    i finding the kind of this code so long.... thank s for your code!
  • Thomas
    Works fine. It does cut off UTF-8 encoded strings though, so that could be an improvement to scan for UTF-8 sequences and not cut right into them...
  • Thanks for the code snippet! I just started printing out short excerpts from a HTML page and I didn't want to strip_tags() just to get it to display right. ;)
  • you welcome
  • Sean NIeuwoudt
    Thanks for this code, works like a charm
  • try it
  • fornetti
    I do not believe this
blog comments powered by Disqus