12-06-2007 05:07 AM
12-14-2007 05:10 AM
import java.io.File;
import java.util.Map;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.transform.AbstractContentTransformer;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.util.TempFileProvider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.htmlparser.beans.StringBean;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.Parser;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.ParserFactory;
/**
 * Content transformer that extracts plain text from XHTML documents by running
 * them through a SAX parser and collecting the character data.
 * <p>
 * Based on Alfresco's HtmlParserContentTransformer implementation.
 */
public class XHtmlParserContentTransformer extends AbstractContentTransformer {
    private static final Log logger = LogFactory.getLog(XHtmlParserContentTransformer.class);

    /** Fully-qualified class name of the SAX parser instantiated for each transformation. */
    private static final String PARSER_CLASS_NAME = "org.apache.xerces.parsers.SAXParser";

    /**
     * Only support XHTML to TEXT.
     *
     * @param sourceMimetype the mimetype of the content to be read
     * @param targetMimetype the mimetype of the content to be written
     * @return <code>1.0</code> when the source is XHTML and the target is plain text,
     *         <code>0.0</code> for every other combination
     */
    public double getReliability(String sourceMimetype, String targetMimetype)
    {
        boolean supported = MimetypeMap.MIMETYPE_XHTML.equals(sourceMimetype)
                && MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype);
        return supported ? 1.0 : 0.0;
    }

    /**
     * Copies the source content to a temporary file, parses it with a SAX parser
     * and writes the collected text to the target.
     *
     * @param reader  supplies the XHTML content
     * @param writer  receives the extracted text
     * @param options transformation options (not used by this implementation)
     * @throws Exception if the parser cannot be created or configured, or if
     *                   reading, parsing or writing fails
     */
    public void transformInternal(ContentReader reader, ContentWriter writer, Map<String, Object> options) throws Exception
    {
        // we can only work from a file
        File xhtmlFile = TempFileProvider.createTempFile("HtmlParserContentTransformer_", ".html");
        reader.getContent(xhtmlFile);

        // get a parser (TODO: create a pool of parsers for efficiency)
        XMLReader parser = (XMLReader) Class.forName(PARSER_CLASS_NAME).getDeclaredConstructor().newInstance();
        parser.setFeature("http://xml.org/sax/features/validation", false);
        parser.setFeature("http://apache.org/xml/features/continue-after-fatal-error", true);
        // NOTE(review): the handler used below inherits DefaultHandler.fatalError, which is
        // documented to rethrow; a fatal error therefore presumably still aborts the parse
        // despite the feature above - confirm the intended leniency.
        // NOTE(review): with validation off the parser may still fetch the external DTD named
        // in the DOCTYPE; consider restricting that if the content is untrusted, after
        // confirming that XHTML named entities still resolve.

        // create the extractor (parsing happens during construction) and extract
        Converter converter = new Converter(parser, xhtmlFile);
        writer.putContent(converter.getText());
    }

    /**
     * SAX handler that accumulates the character data of a document into a single string.
     * Text belonging to different elements is separated by a single space.
     */
    private static class Converter extends DefaultHandler {
        private final StringBuilder text = new StringBuilder();

        /**
         * Registers this handler on the parser and parses the given file immediately.
         *
         * @param parser    the reader used to parse the file
         * @param xhtmlFile the file holding the XHTML content
         * @throws Exception if parsing fails
         */
        public Converter(XMLReader parser, File xhtmlFile) throws Exception
        {
            // set up parser
            parser.setContentHandler(this);
            parser.setErrorHandler(this);

            // File.toURI() yields a well-formed, escaped file: URI on every platform;
            // plain "file://" + path breaks on Windows paths and on paths with spaces.
            String systemId = xhtmlFile.toURI().toString();

            // parse
            long before = System.currentTimeMillis();
            parser.parse(systemId);
            if (logger.isDebugEnabled())
            {
                long after = System.currentTimeMillis();
                logger.debug("Conversion time: " + (after - before) + "ms.");
            }
        }

        /**
         * Separates the text of adjacent elements so their words do not run together.
         */
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException
        {
            appendSeparator();
        }

        /**
         * Separates the text of adjacent elements so their words do not run together.
         */
        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException
        {
            appendSeparator();
        }

        /**
         * Appends a chunk of character data verbatim. A parser may report one run of text
         * in several chunks, so no separator is inserted here - doing so could split a word.
         */
        @Override
        public void characters(char[] ch, int start, int length) throws SAXException
        {
            text.append(ch, start, length);
        }

        /**
         * @return all text collected during parsing
         */
        public final String getText()
        {
            final String words = text.toString();
            if (logger.isDebugEnabled())
            {
                logger.debug("Text is: " + words);
            }
            return words;
        }

        /** Appends one space unless the buffer is empty or already ends in whitespace. */
        private void appendSeparator()
        {
            int length = text.length();
            if (length > 0 && !Character.isWhitespace(text.charAt(length - 1)))
            {
                text.append(' ');
            }
        }
    }
}
<bean id="transformer.XHtmlParser" class="org.my.module.mypackage.transformers.XHtmlParserContentTransformer" parent="baseContentTransformer" />
11-06-2008 08:01 AM
Tags
Find what you came for
We want to make your experience in Hyland Connect as valuable as possible, so we put together some helpful links.