cancel
Showing results for 
Search instead for 
Did you mean: 

Indexing XHTML

hbf
Champ on-the-rise
Champ on-the-rise
Hi,

I see that currently only the cm:content of nodes of type HTML, but not of type XHTML, is indexed by Lucene.

I'd like to contribute a transformer from XHTML to plain text, only question I have is: which library should I use?

I see from HtmlParserContentTransformer.java (in Alfresco SVN) that Alfresco currently uses http://htmlparser.sourceforge.net/ for HTML-to-plain-text conversion. I could not find, however, any information on whether this library works for XHTML, too.

Any suggestions?

Kaspar
2 REPLIES 2

hbf
Champ on-the-rise
Champ on-the-rise
Here is a very simple, untested, and possibly incomplete (w.r.t. efficiency, for example) version based on Xerces. It might serve as a starting point for others.


import java.io.File;
import java.util.Map;

import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.transform.AbstractContentTransformer;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.util.TempFileProvider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.htmlparser.beans.StringBean;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.Parser;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.ParserFactory;

/**
* Based on Alfresco's HtmlParserContentTransformer implementation.
*/
public class XHtmlParserContentTransformer extends AbstractContentTransformer {
  private static final Log logger = LogFactory.getLog(XHtmlParserContentTransformer.class);

  /**
   * Only support XHTML to TEXT.
   */
  public double getReliability(String sourceMimetype, String targetMimetype)
  {
    if (!MimetypeMap.MIMETYPE_XHTML.equals(sourceMimetype) || !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype)) {
      // only support XHTML -> TEXT
      return 0.0;
    } else {
      return 1.0;
    }
  }

  public void transformInternal(ContentReader reader, ContentWriter writer, Map<String, Object> options) throws Exception
  {
    // we can only work from a file
    File xhtmlFile = TempFileProvider.createTempFile("HtmlParserContentTransformer_", ".html");
    reader.getContent(xhtmlFile);

    // get a parser (TODO: create a pool of parsers for efficiency)
    final String parserName = "org.apache.xerces.parsers.SAXParser";
    XMLReader parser = (XMLReader) Class.forName(parserName).newInstance();
    parser.setFeature("http://xml.org/sax/features/validation", false);
    // parser.setFeature( "http://xml.org/sax/features/namespaces", false);
    // parser.setFeature( "http://apache.org/xml/features/validation/schema",
    // setSchemaSupport );
    // parser.setFeature(
    // "http://apache.org/xml/features/validation/schema-full-checking", false);
    parser.setFeature("http://apache.org/xml/features/continue-after-fatal-error", true);

    // create the extractor
    Converter converter = new Converter(parser, xhtmlFile);

    // extract
    String text = converter.getText();

    writer.putContent(text);
  }

  private class Converter extends DefaultHandler {
    private XMLReader parser;
    private StringBuilder text;

    public Converter(XMLReader parser, File xhtmlFile) throws Exception
    {
      this.parser = parser;
      this.text = new StringBuilder();

      // set up parser
      parser.setContentHandler(this);
      parser.setErrorHandler(this);

      // parse
      String path = "file://" + xhtmlFile.getAbsolutePath();
      long before = System.currentTimeMillis();
      parser.parse(path);
      long after = System.currentTimeMillis();
      logger.debug("Conversion time: " + (after - before) + "ms.");
    }

    public void characters(char[] ch, int start, int length) throws SAXException
    {
      text.append(' ');
      text.append(ch, start, length);
    }

    public final String getText()
    {
      final String words = text.toString();
      if (logger.isDebugEnabled())
        logger.debug("Text is: "+words);
      return words;
    }

  }
}

You have to configure this using something like

 <bean id="transformer.XHtmlParser" class="org.my.module.mypackage.transformers.XHtmlParserContentTransformer" parent="baseContentTransformer" />

hbf
Champ on-the-rise
Champ on-the-rise
An updated version is available on this Wiki page.