03-02-2011 08:10 AM
<!--
  Registers extractor.PDFCustomExtractor under the id "extracter.pdf", inheriting
  configuration from the "baseMetadataExtracter" parent bean. The mapping properties
  are loaded from a properties file on the classpath, with inheritDefaultMapping set to true.
-->
<bean id="extracter.pdf" class="extractor.PDFCustomExtractor" parent="baseMetadataExtracter">
    <property name="inheritDefaultMapping" value="true"/>
    <property name="mappingProperties">
        <bean class="org.springframework.beans.factory.config.PropertiesFactoryBean">
            <property name="location"
                      value="classpath:alfresco/extension/custom-pdf-extractor-mappings.properties"/>
        </bean>
    </property>
</bean>
<!--
  Registers extractor.MyOfficeMetadataExtractor under the id "extracter.off", inheriting
  configuration from the "baseMetadataExtracter" parent bean. The mapping properties
  are loaded from a properties file on the classpath, with inheritDefaultMapping set to true.
-->
<bean id="extracter.off" class="extractor.MyOfficeMetadataExtractor" parent="baseMetadataExtracter">
    <property name="inheritDefaultMapping" value="true"/>
    <property name="mappingProperties">
        <bean class="org.springframework.beans.factory.config.PropertiesFactoryBean">
            <property name="location"
                      value="classpath:alfresco/extension/custom-office-extractor-mappings.properties"/>
        </bean>
    </property>
</bean>
public class PDFCustomExtractor extends AbstractMappingMetadataExtracter
{
private static final String KEY_AUTHOR = "author";
private static final String KEY_TITLE = "title";
private static final String KEY_SUBJECT = "subject";
private static final String KEY_CREATED = "created";
private static final String KEY_KEYWORDS = "keywords";
public static String[] SUPPORTED_MIMETYPES = { MimetypeMap.MIMETYPE_PDF };
private static Log log = LogFactory.getLog(PDFCustomExtractor.class);
public PDFCustomExtractor()
{
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
}
public Map<String, Serializable> extractRaw(ContentReader reader)
throws Throwable
{
Map<String, Serializable> rawProperties = newRawMap();
PDDocument pdf = null;
InputStream is = null;
try
{
is = reader.getContentInputStream();
pdf = PDDocument.load(is);
if (pdf.isEncrypted()) {
//break label337;
}
PDDocumentInformation docInfo = pdf.getDocumentInformation();
putRawValue("author", docInfo.getAuthor(), rawProperties);
putRawValue("title", docInfo.getTitle(), rawProperties);
putRawValue("subject", docInfo.getSubject(), rawProperties);
String keywords = docInfo.getKeywords();
try
{
keywords = keywords.trim();
while (keywords.charAt(0) == '"') {
keywords = keywords.substring(1);
}
while (keywords.charAt(keywords.length() - 1) == '"') {
keywords = keywords.substring(0, keywords.length() - 1);
}
keywords = keywords.trim();
StringTokenizer stcomma = new StringTokenizer(keywords, ";");
while (stcomma.hasMoreTokens()) {
String token = stcomma.nextToken();
StringTokenizer sttoken = new StringTokenizer(token, "=");
putRawValue(sttoken.nextToken(), sttoken.nextToken(), rawProperties);
}
}
catch (Exception x) {
log.info("\n\nExtracter: " + x.toString() + ".\n");
}
if (keywords != null) {
log.info("\n\nKeywords es:" + keywords + ".\n");
putRawValue("keywords", keywords, rawProperties);
}
else {
log.info("\n\nKeywords es null.\n");
}
try
{
Calendar created = docInfo.getCreationDate();
label337: if (created != null)
{
putRawValue("created", created.getTime(), rawProperties);
}
}
catch (IOException localIOException)
{
}
}
finally
{
if (is != null)
try {
is.close(); } catch (IOException localIOException1) {
}
if (pdf != null) {
try {
pdf.close(); } catch (Throwable e) { e.printStackTrace(); }
}
}
log.info("\n\nPropiedades de vuelta:" + rawProperties.toString() + "\n");
return rawProperties;
}
}
03-02-2011 10:36 AM
/**
 * Tika-based metadata extracter for PDF content.
 *
 * <p>{@link #extractRaw(ContentReader)} is overridden and reads the document information with
 * PDFBox directly rather than going through the parser returned by {@link #getParser()}. The
 * keywords string is additionally interpreted as a list of {@code key=value} pairs separated
 * by {@code ;}; every well-formed pair is published as its own raw property.
 */
public class PDFCustomExtractor extends TikaPoweredMetadataExtracter
{
    protected static Log logger = LogFactory.getLog(PDFCustomExtractor.class);

    /** Mimetypes handled by this extracter, built from the PDF mimetype and a PDF parser. */
    public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
        new String[] { MimetypeMap.MIMETYPE_PDF },
        new PDFParser()
    );

    /** Creates the extracter and registers the supported mimetypes with the superclass. */
    public PDFCustomExtractor()
    {
        super(SUPPORTED_MIMETYPES);
    }

    /** Returns a new PDF parser instance. */
    @Override
    protected Parser getParser()
    {
        return new PDFParser();
    }

    /**
     * Extracts the raw (unmapped) properties from a PDF.
     *
     * <p>Encrypted documents are skipped and yield an empty map: the original code carried a
     * disabled {@code break} for this case, which left the encryption check without effect.
     *
     * @param reader source of the PDF content
     * @return map of raw property names to values; never {@code null}
     * @throws Throwable if the content cannot be opened or parsed as a PDF
     */
    @Override
    public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
    {
        Map<String, Serializable> rawProperties = newRawMap();
        PDDocument pdf = null;
        InputStream is = null;
        try
        {
            is = reader.getContentInputStream();
            pdf = PDDocument.load(is);
            if (pdf.isEncrypted())
            {
                logger.info("Skipping metadata extraction: the PDF document is encrypted.");
            }
            else
            {
                PDDocumentInformation docInfo = pdf.getDocumentInformation();
                putRawValue("author", docInfo.getAuthor(), rawProperties);
                putRawValue("title", docInfo.getTitle(), rawProperties);
                putRawValue("subject", docInfo.getSubject(), rawProperties);
                extractKeywords(docInfo.getKeywords(), rawProperties);
                try
                {
                    Calendar created = docInfo.getCreationDate();
                    if (created != null)
                    {
                        putRawValue("created", created.getTime(), rawProperties);
                    }
                }
                catch (IOException e)
                {
                    // An unreadable creation date must not discard the other properties.
                    logger.warn("Unable to read the PDF creation date; property skipped.", e);
                }
            }
        }
        finally
        {
            if (is != null)
            {
                try
                {
                    is.close();
                }
                catch (IOException e)
                {
                    logger.warn("Failed to close the PDF content stream.", e);
                }
            }
            if (pdf != null)
            {
                try
                {
                    pdf.close();
                }
                catch (Throwable e)
                {
                    logger.warn("Failed to close the PDF document.", e);
                }
            }
        }
        logger.info("\n\nPropiedades de vuelta:" + rawProperties.toString() + "\n");
        return rawProperties;
    }

    /**
     * Publishes the keywords string and every {@code key=value} pair found inside it.
     *
     * <p>Pairs are separated by {@code ;}. A token that does not contain both a key and a
     * value is ignored instead of aborting the remaining pairs. Keys and values are trimmed.
     *
     * @param rawKeywords keywords as stored in the document information; may be {@code null}
     * @param rawProperties destination map for the raw properties
     */
    private void extractKeywords(String rawKeywords, Map<String, Serializable> rawProperties)
    {
        if (rawKeywords == null)
        {
            logger.info("\n\nKeywords es null.\n");
            return;
        }
        String keywords = stripEnclosingQuotes(rawKeywords);
        StringTokenizer pairs = new StringTokenizer(keywords, ";");
        while (pairs.hasMoreTokens())
        {
            StringTokenizer keyValue = new StringTokenizer(pairs.nextToken(), "=");
            if (keyValue.countTokens() < 2)
            {
                continue; // not a key=value pair
            }
            String key = keyValue.nextToken().trim();
            String value = keyValue.nextToken().trim();
            if (key.length() > 0)
            {
                putRawValue(key, value, rawProperties);
            }
        }
        logger.info("\n\nKeywords es:" + keywords + ".\n");
        putRawValue("keywords", keywords, rawProperties);
    }

    /**
     * Trims the text, removes any run of double quotes at either end, and trims again.
     * Safe for empty input and for text consisting only of quotes.
     */
    private static String stripEnclosingQuotes(String text)
    {
        String trimmed = text.trim();
        int start = 0;
        int end = trimmed.length();
        while (start < end && trimmed.charAt(start) == '"')
        {
            start++;
        }
        while (end > start && trimmed.charAt(end - 1) == '"')
        {
            end--;
        }
        return trimmed.substring(start, end).trim();
    }
}
Tags
Find what you came for
We want to make your experience in Hyland Connect as valuable as possible, so we put together some helpful links.