Spiga

Cut HTML string without breaking the tags

by Gabi Solomon

On a recent project I had to take some HTML source code and break it into several pieces, all without breaking the HTML tags. After struggling for a while I started googling for a solution from the most inspired developers :D . After some time spent trying different pieces of code, I came upon the code below. It is a method from CakePHP and it did the job perfectly.

I hope you're lucky enough to find this article and save yourself some time.
Cheers

PHP:
  /**
   * Truncates text to a maximum length, optionally keeping HTML markup intact.
   *
   * When the text is longer than $length it is cut and $ending is appended, so
   * that the kept text plus $ending does not exceed $length. Lengths are measured
   * in bytes (strlen); when $text is valid UTF-8 the cut is moved back so that it
   * never lands inside a multi-byte character.
   *
   * With $considerHtml enabled, tags do not count towards the length, every HTML
   * entity counts as a single character, and all tags that are still open at the
   * cut position are closed after the ending.
   *
   * @param string  $text         String to truncate.
   * @param integer $length       Maximum length of the returned text, including the ending.
   * @param string  $ending       Ending to be appended to the trimmed string.
   * @param boolean $exact        If false, the text is cut back to the last space so that
   *                              no word is cut in the middle; text without any space is
   *                              left as cut.
   * @param boolean $considerHtml If true, HTML tags are kept intact and re-closed.
   * @return string The original text if it fits, otherwise the trimmed text plus $ending.
   */
  function truncate($text, $length = 100, $ending = '...', $exact = true, $considerHtml = false) {
      // matches one HTML entity (named, decimal or hexadecimal)
      $entity_pattern = '/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/i';

      // only shift cut positions for text that really is UTF-8; in single-byte
      // encodings the bytes 0x80-0xBF are ordinary characters
      $is_utf8 = preg_match('//u', $text) === 1;

      // returns the first $bytes bytes of $string without splitting a UTF-8 character
      $cut = function ($string, $bytes) use ($is_utf8) {
          // a negative length would make substr() count from the end of the string
          $bytes = max(0, $bytes);
          if ($is_utf8) {
              // step back while the first dropped byte is a continuation byte (10xxxxxx)
              while ($bytes > 0 && $bytes < strlen($string) && (ord($string[$bytes]) & 0xC0) === 0x80) {
                  $bytes--;
              }
          }
          return (string) substr($string, 0, $bytes);
      };

      $open_tags = array();
      // byte offset (in $truncate) of the last space that belongs to text, not to a tag
      $last_space = false;
      // the tags that were open at the position of $last_space
      $tags_at_space = array();

      if ($considerHtml) {
          // split the input into pairs of (optional tag, following plain text)
          preg_match_all('/(<.+?>)?([^<>]*)/s', $text, $lines, PREG_SET_ORDER);

          // measure the visible text exactly like the loop below does (entities
          // count as one character), so the ending is only added on a real cut
          $visible_length = 0;
          foreach ($lines as $line_matchings) {
              $visible_length += strlen(preg_replace($entity_pattern, ' ', $line_matchings[2]));
          }
          if ($visible_length <= $length) {
              return $text;
          }

          $total_length = strlen($ending);
          $truncate = '';

          foreach ($lines as $line_matchings) {
              // a tag is handled and added (uncounted) to the output
              if (!empty($line_matchings[1])) {
                  if (preg_match('/^<(\s*.+?\/\s*|\s*(img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param)(\s.+?)?)>$/is', $line_matchings[1])) {
                      // "empty element" with or without closing slash (e.g. <br/>): nothing to track
                  } else if (preg_match('/^<\s*\/([^\s]+?)\s*>$/s', $line_matchings[1], $tag_matchings)) {
                      // closing tag: names are stored lower-cased, so compare lower-cased
                      $pos = array_search(strtolower($tag_matchings[1]), $open_tags, true);
                      if ($pos !== false) {
                          unset($open_tags[$pos]);
                      }
                  } else if (preg_match('/^<\s*([^\s>!]+).*?>$/s', $line_matchings[1], $tag_matchings)) {
                      // opening tag: most recently opened first, so closing order is correct
                      array_unshift($open_tags, strtolower($tag_matchings[1]));
                  }
                  $truncate .= $line_matchings[1];
              }

              // length of the plain text part; entities count as one character
              $content_length = strlen(preg_replace($entity_pattern, ' ', $line_matchings[2]));
              $limit_reached = ($total_length + $content_length > $length);

              if ($limit_reached) {
                  // number of characters that still fit
                  $left = max(0, $length - $total_length);
                  $entities_length = 0;
                  if (preg_match_all($entity_pattern, $line_matchings[2], $entities, PREG_OFFSET_CAPTURE)) {
                      // every entity inside the kept range uses one character of the
                      // budget but occupies strlen(entity) bytes of the source
                      foreach ($entities[0] as $entity) {
                          if ($entity[1] + 1 - $entities_length <= $left) {
                              $left--;
                              $entities_length += strlen($entity[0]);
                          } else {
                              // no more characters left
                              break;
                          }
                      }
                  }
                  $piece = $cut($line_matchings[2], $left + $entities_length);
              } else {
                  $piece = $line_matchings[2];
                  $total_length += $content_length;
              }

              // remember the last space of the text (never one inside a tag)
              // together with the tags that are open at that point
              $space_in_piece = strrpos($piece, ' ');
              if ($space_in_piece !== false) {
                  $last_space = strlen($truncate) + $space_in_piece;
                  $tags_at_space = $open_tags;
              }
              $truncate .= $piece;

              // maximum length is reached, so leave the loop
              if ($limit_reached || $total_length >= $length) {
                  break;
              }
          }
      } else {
          if (strlen($text) <= $length) {
              return $text;
          }
          $truncate = $cut($text, $length - strlen($ending));
          $last_space = strrpos($truncate, ' ');
      }

      // if words shouldn't be cut in the middle, cut back to the last space;
      // strrpos() reports "not found" as false, in which case nothing is removed
      if (!$exact && $last_space !== false) {
          $truncate = substr($truncate, 0, $last_space);
          if ($considerHtml) {
              // tags opened or closed after the space were just removed with the text
              $open_tags = $tags_at_space;
          }
      }

      // add the defined ending to the text
      $truncate .= $ending;

      // close all unclosed html-tags (the list is empty when $considerHtml is false)
      foreach ($open_tags as $tag) {
          $truncate .= '</' . $tag . '>';
      }

      return $truncate;
  }

Related Posts

  • fornetti

    I do not believe this

  • http://www.gsdesign.ro/ Gabi Solomon

    try it

  • Sean NIeuwoudt

    Thanks for this code, works like a charm

  • http://www.gsdesign.ro/ Gabi Solomon

    you welcome

  • http://xeoncross.com David

    Thanks for the code snippet! I just started printing out short excerpts from a HTML page and I didn't want to strip_tags() just to get it to display right. ;)

  • Thomas

    Works fine. It does cut off UTF-8 encoded strings though, so that could be an improvement to scan for UTF-8 sequences and not cut right into them...

  • lecone

    i finding the kind of this code so long.... thank s for your code!

  • http://www.gsdesign.ro/ Gabi Solomon

    @lecone you welcome

  • phong-tt

    thanks for your code man, i'm keep searching for a while until now :)

  • http://www.gsdesign.ro/ Gabi Solomon

    @phong-tt
    you welcome

  • Pingback: How do I truncate an HTML string without breaking the HTML code? « Dodona gives you answers

  • Snypy

    Thumbs up! ;-)

    Thanks, really helped me. Since I used cuttext method, I was wondering how to cut html code without loosing tags. Now, you solved my problem :-)

  • http://www.kamiyeye.com kami

    what if set '$exact=false' with "...the last space is before tag nospace</html>" ? the tag '' will be lost, isn't it?

    • http://intensedebate.com/people/solomongaby solomongaby

      @kami
      if the exact parameters is true, the text will have an exact lenght and words will be cut, if false, the text might have a few letters extra because it will not cut the words.

  • Roly

    Thanks I was looking for something that does exactly this.

  • http://vanco.ordanoski.name/ Vanco

    Thanks!
    Your work saved me hours of work.

  • http://www.google.com/ Google

    too old version, on line 96 cutting html text '<a href' by space and ruins html.

    Google was here.

  • http://www.reallyeasycart.co.uk/ Andrew Stilliard

    This is brilliant cheers! Saved me hours!

  • yogix

    Nice one! Thank you. I´ve been searching for this few months already!

  • http://www.securityhacking.tk/ Vlad

    Hi,
    I've modified your version a little bit, because it was breaking tags if $exact=false and it hits space in the middle of the tag. Really simple, just adding another variable, $doingtag.
    You can check it our here http://www.securityhacking.tk/2010/02/cut-html-...

  • KDV

    this peice code is awesome...it really saved me a days work

  • Pingback: Wordpress : the_content_limit() v.2 | Staicu Ionuţ-Bogdan - the Frontend Developer

  • Gonzalo

    Awesome! It works perfectly. Thank so much.

  • http://www.air-jordan-13.com air jordan 13

    Mark S. is definitely on the right track. If you want to get a professional looking email address, Id recommend buying your name domain name, like or
    Gucci sweaters
    If its common it might be difficult to get, however, be creative and you can usually find something.

  • http://www.gammelsaeter.com Paul G

    I googled and found exactly what I was looking for in this blog post. Thank you, saved me a lot of time! :)

  • http://www.innovavista.net Alejandro

    Worked for me, thank you very much!

  • http://twitter.com/brixterdeleon brixter

    Hey @ Author,you might want to consider modifying it because if you included HTML tags within the truncated part it'l break.

  • http://www.reallyeasycart.co.uk Ecommerce Software

    Cascading style sheets (CSS) will greatly reduce the amount of code within your web pages. This will also cut down on the amount of web space and bandwidth used thus saving you money for hosting your site.

  • Jama211

    Thanks, this helped me a lot.