cancel
Showing results for 
Search instead for 
Did you mean: 

Metadata extraction

jglezar
Champ in-the-making
Champ in-the-making
I need to extract custom metadata from Office files. I've modified the OfficeMetadataExtracter class to meet my needs. I have two problems:

1) The class/package org.apache.poi.hpsf.DocumentSummaryInformation included in Alfresco 2.1 isn't the latest and doesn't include the getCustomProperties() method, which is needed to extract custom metadata from Office files, so my question is how can I update org.apache.poi.hpsf.DocumentSummaryInformation to the latest version in Alfresco.

2) How can I deploy/incorporate my custom class into Alfresco? Does it need to be a .class file or a .jar file?

My files are as follows:

MyOfficeMetadataExtracter.java

/*
* Copyright (C) 2005 Jesper Steen Møller
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.

* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.

* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception.  You should have recieved a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
//package org.alfresco.repo.content.metadata;

package org.alfrescox.repo.content.metadata;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;

import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;

/**
* Office file format Metadata Extracter.  This extracter uses the POI library to extract
* the following:
* <pre>
*   <b>author:</b>             –      cm:author
*   <b>title:</b>              –      cm:title
*   <b>subject:</b>            –      cm:description
*   <b>createDateTime:</b>     –      cm:created
*   <b>lastSaveDateTime:</b>   –      cm:modified
*   <b>comments:</b>
*   <b>editTime:</b>
*   <b>format:</b>
*   <b>keywords:</b>
*   <b>lastAuthor:</b>
*   <b>lastPrinted:</b>
*   <b>osVersion:</b>
*   <b>thumbnail:</b>
*   <b>pageCount:</b>
*   <b>wordCount:</b>
* </pre>
*
* @author Jesper Steen Møller
* @author Derek Hulley
*/
public class MyOfficeMetadataExtracter extends AbstractMappingMetadataExtracter
{
    public static final String KEY_AUTHOR = "author";
    public static final String KEY_TITLE = "title";
    public static final String KEY_SUBJECT = "subject";
    public static final String KEY_CREATE_DATETIME = "createDateTime";
    public static final String KEY_LAST_SAVE_DATETIME = "lastSaveDateTime";
    public static final String KEY_COMMENTS = "comments";
    public static final String KEY_EDIT_TIME = "editTime";
    public static final String KEY_FORMAT = "format";
    public static final String KEY_KEYWORDS = "keywords";
    public static final String KEY_LAST_AUTHOR = "lastAuthor";
    public static final String KEY_LAST_PRINTED = "lastPrinted";
    public static final String KEY_OS_VERSION = "osVersion";
    public static final String KEY_THUMBNAIL = "thumbnail";
    public static final String KEY_PAGE_COUNT = "pageCount";
    public static final String KEY_WORD_COUNT = "wordCount";
    public static final String KEY_CATEGORY = "category";
    public static final String KEY_MANAGER = "manager";
    public static final String KEY_CLIENTE = "cliente";
    public static final String KEY_MATTER = "matter";
    public static final String KEY_LANGUAGE = "language";
   
    public static String[] SUPPORTED_MIMETYPES = new String[] {
        MimetypeMap.MIMETYPE_WORD,
        MimetypeMap.MIMETYPE_EXCEL,
        MimetypeMap.MIMETYPE_PPT};

    public MyOfficeMetadataExtracter()
    {
        super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
    }

    @Override
    protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
    {
        final Map<String, Serializable> rawProperties = newRawMap();
       
        POIFSReaderListener readerListener = new POIFSReaderListener()
        {
            public void processPOIFSReaderEvent(final POIFSReaderEvent event)
            {
                try
                {
                    PropertySet ps = PropertySetFactory.create(event.getStream());
                    if (ps instanceof SummaryInformation)
                    {
                        SummaryInformation si = (SummaryInformation) ps;
                       
                        putRawValue(KEY_AUTHOR, si.getAuthor(), rawProperties);
                        putRawValue(KEY_TITLE, si.getTitle(), rawProperties);
                        putRawValue(KEY_SUBJECT, si.getSubject(), rawProperties);
                        putRawValue(KEY_CREATE_DATETIME, si.getCreateDateTime(), rawProperties);
                        putRawValue(KEY_LAST_SAVE_DATETIME, si.getLastSaveDateTime(), rawProperties);
                        putRawValue(KEY_COMMENTS, si.getComments(), rawProperties);
                        putRawValue(KEY_EDIT_TIME, si.getEditTime(), rawProperties);
                        putRawValue(KEY_FORMAT, si.getFormat(), rawProperties);
                        putRawValue(KEY_KEYWORDS, si.getKeywords(), rawProperties);
                        putRawValue(KEY_LAST_AUTHOR, si.getLastAuthor(), rawProperties);
                        putRawValue(KEY_LAST_PRINTED, si.getLastPrinted(), rawProperties);
                        putRawValue(KEY_OS_VERSION, si.getOSVersion(), rawProperties);
                        putRawValue(KEY_THUMBNAIL, si.getThumbnail(), rawProperties);
                        putRawValue(KEY_PAGE_COUNT, si.getPageCount(), rawProperties);
                        putRawValue(KEY_WORD_COUNT, si.getWordCount(), rawProperties);
                    }
                }
                catch (Exception ex)
                {
                    throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex);
                }
            // here are the modifications I made…
               try
                {
                    PropertySet psd = PropertySetFactory.create(event.getStream());
                    if (psd instanceof DocumentSummaryInformation)
                    {
                        DocumentSummaryInformation dsi = (DocumentSummaryInformation) psd;
                     
                      putRawValue(KEY_CATEGORY, dsi.getCategory(),rawProperties);
                        putRawValue(KEY_MANAGER, dsi.getManager(),rawProperties);
                        putRawValue(KEY_CLIENTE, dsi.getCustomProperties().get("Cliente"),rawProperties);
                        putRawValue(KEY_MATTER, dsi.getCustomProperties().get("Matter),rawProperties);
                        putRawValue(KEY_LANGUAGE, dsi.getCustomProperties().get("Language"),rawProperties);                       
                    }
                }
                catch (Exception exd)
                {
                    throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), exd);
                }
                //modifications end here
            }
        };
       
        InputStream is = null;
        try
        {
            is = reader.getContentInputStream();
            POIFSReader poiFSReader = new POIFSReader();
            poiFSReader.registerListener(readerListener, SummaryInformation.DEFAULT_STREAM_NAME);
            poiFSReader.read(is);
        }
        finally
        {
            if (is != null)
            {
                try { is.close(); } catch (IOException e) {}
            }
        }
        return rawProperties;
    }
}

custom-metadata-extrators-context.xml

<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE beans PUBLIC '-//SPRING//DTD BEAN//EN' 'http://www.springframework.org/dtd/spring-beans.dtd'>

<!--
       This sample shows how to modify the mapping properties of the new V2.1 Metadata Extractors.
       In this example, in addition to the default mappings, the field 'user1' is mapped to
       'cm:description'.  The available source properties are described in the Javadocs of the
       extracter class.
-->
<beans>

    <!-- Replaces the 'extracter.Office' bean with the custom extracter class. -->
    <bean id="extracter.Office"
          class="org.alfrescox.repo.content.metadata.MyOfficeMetadataExtracter"
          parent="baseMetadataExtracter">

        <!-- Keep the default mappings and add the ones listed below. -->
        <property name="inheritDefaultMapping" value="true"/>

        <property name="mappingProperties">
            <props>
                <!-- Namespace prefixes used by the mappings below -->
                <prop key="namespace.prefix.cm">http://www.alfresco.org/model/content/1.0</prop>
                <prop key="namespace.prefix.ne">nye.model</prop>

                <!-- Raw key (as emitted by the extracter) mapped to a model property -->
                <prop key="comments">cm:description</prop>
                <prop key="subject">ne:Practica</prop>
                <prop key="keywords">ne:Keywords</prop>
                <prop key="category">ne:Documento</prop>
                <prop key="manager">ne:SocioR</prop>
                <prop key="cliente">ne:Cliente</prop>
                <prop key="matter">ne:Matter</prop>
                <prop key="language">ne:Idioma</prop>
            </props>
        </property>
    </bean>

</beans>


Thanks,

Javier Gonzalez de Aragon
3 REPLIES 3

msporled
Champ in-the-making
Champ in-the-making
Hey, your code doesn't work.  Smiley Happy  I'm trying to do the same thing and it looks like you're also stuck using DocumentSummaryInformation and SummaryInformation in the same class.  Any luck?

Did you ever figure out how to deploy this?  You need to download the SDK, open it up in eclipse, and start a new project.

grégo
Champ in-the-making
Champ in-the-making
Hi,

I had the same problem and found the solution :

Your code doesn't work because the poiFSReader is only reading the SummaryInformation stream. If you want to get the DocumentSummaryInformation too, you must specify it when you register the listener.


poiFSReader.registerListener(readerListener, SummaryInformation.DEFAULT_STREAM_NAME);
poiFSReader.registerListener(readerListener, DocumentSummaryInformation.DEFAULT_STREAM_NAME);

msporled
Champ in-the-making
Champ in-the-making
For completeness, this works for me:
    /*
    * Copyright (C) 2005 Jesper Steen Møller
    *
    * This program is free software; you can redistribute it and/or
    * modify it under the terms of the GNU General Public License
    * as published by the Free Software Foundation; either version 2
    * of the License, or (at your option) any later version.

    * This program is distributed in the hope that it will be useful,
    * but WITHOUT ANY WARRANTY; without even the implied warranty of
    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    * GNU General Public License for more details.

    * You should have received a copy of the GNU General Public License
    * along with this program; if not, write to the Free Software
    * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

    * As a special exception to the terms and conditions of version 2.0 of
    * the GPL, you may redistribute this Program in connection with Free/Libre
    * and Open Source Software ("FLOSS") applications as described in Alfresco's
    * FLOSS exception.  You should have recieved a copy of the text describing
    * the FLOSS exception, and it is also available here:
    * http://www.alfresco.com/legal/licensing"
    */
    //package org.alfresco.repo.content.metadata;

    package com.mycompany.extract;

    import java.io.IOException;
    import java.io.InputStream;
    import java.io.Serializable;
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Map;

    import org.alfresco.repo.content.MimetypeMap;
    import org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter;
    import org.alfresco.service.cmr.repository.ContentIOException;
    import org.alfresco.service.cmr.repository.ContentReader;
    import org.apache.poi.hpsf.PropertySet;
    import org.apache.poi.hpsf.PropertySetFactory;
    import org.apache.poi.hpsf.SummaryInformation;
    import org.apache.poi.hpsf.DocumentSummaryInformation;
    import org.apache.poi.poifs.eventfilesystem.POIFSReader;
    import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
    import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;

    /**
    * Office file format Metadata Extracter.  This extracter uses the POI library to extract
    * the following:
    * <pre>
    *   <b>author:</b>             –      cm:author
    *   <b>title:</b>              –      cm:title
    *   <b>subject:</b>            –      cm:description
    *   <b>createDateTime:</b>     –      cm:created
    *   <b>lastSaveDateTime:</b>   –      cm:modified
    *   <b>comments:</b>
    *   <b>editTime:</b>
    *   <b>format:</b>
    *   <b>keywords:</b>
    *   <b>lastAuthor:</b>
    *   <b>lastPrinted:</b>
    *   <b>osVersion:</b>
    *   <b>thumbnail:</b>
    *   <b>pageCount:</b>
    *   <b>wordCount:</b>
    * </pre>
    *
    * @author Jesper Steen Møller
    * @author Derek Hulley
    */
    public class MyOfficeMetadataExtracter extends AbstractMappingMetadataExtracter
    {
        public static final String KEY_AUTHOR = "author";
        public static final String KEY_TITLE = "title";
        public static final String KEY_SUBJECT = "subject";
        public static final String KEY_CREATE_DATETIME = "createDateTime";
        public static final String KEY_LAST_SAVE_DATETIME = "lastSaveDateTime";
        public static final String KEY_COMMENTS = "comments";
        public static final String KEY_EDIT_TIME = "editTime";
        public static final String KEY_FORMAT = "format";
        public static final String KEY_KEYWORDS = "keywords";
        public static final String KEY_LAST_AUTHOR = "lastAuthor";
        public static final String KEY_LAST_PRINTED = "lastPrinted";
        public static final String KEY_OS_VERSION = "osVersion";
        public static final String KEY_THUMBNAIL = "thumbnail";
        public static final String KEY_PAGE_COUNT = "pageCount";
        public static final String KEY_WORD_COUNT = "wordCount";
       
        public static final String KEY_CUSTOM1 = "Custom1";
        public static final String KEY_CUSTOM2 = "Custom2";
        public static final String KEY_FOO = "foo";
       
      
        public static String[] SUPPORTED_MIMETYPES = new String[] {
            MimetypeMap.MIMETYPE_WORD,
            MimetypeMap.MIMETYPE_EXCEL,
            MimetypeMap.MIMETYPE_PPT};

        public MyOfficeMetadataExtracter()
        {
            super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
        }

        @Override
        protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
        {
            final Map<String, Serializable> rawProperties = newRawMap();
          
            POIFSReaderListener readerListener1 = new POIFSReaderListener()
            {
                public void processPOIFSReaderEvent(final POIFSReaderEvent event)
                {
                   
                    try
                    {
                        PropertySet ps = PropertySetFactory.create(event.getStream());
                        if (ps instanceof SummaryInformation)
                        {
                            SummaryInformation si = (SummaryInformation) ps;
                          
                            putRawValue(KEY_AUTHOR, si.getAuthor(), rawProperties);
                            putRawValue(KEY_TITLE, si.getTitle(), rawProperties);
                            putRawValue(KEY_SUBJECT, si.getSubject(), rawProperties);
                            putRawValue(KEY_CREATE_DATETIME, si.getCreateDateTime(), rawProperties);
                            putRawValue(KEY_LAST_SAVE_DATETIME, si.getLastSaveDateTime(), rawProperties);
                            putRawValue(KEY_COMMENTS, si.getComments(), rawProperties);
                            putRawValue(KEY_EDIT_TIME, si.getEditTime(), rawProperties);
                            putRawValue(KEY_FORMAT, si.getFormat(), rawProperties);
                            putRawValue(KEY_KEYWORDS, si.getKeywords(), rawProperties);
                            putRawValue(KEY_LAST_AUTHOR, si.getLastAuthor(), rawProperties);
                            putRawValue(KEY_LAST_PRINTED, si.getLastPrinted(), rawProperties);
                            putRawValue(KEY_OS_VERSION, si.getOSVersion(), rawProperties);
                            putRawValue(KEY_THUMBNAIL, si.getThumbnail(), rawProperties);
                            putRawValue(KEY_PAGE_COUNT, si.getPageCount(), rawProperties);
                            putRawValue(KEY_WORD_COUNT, si.getWordCount(), rawProperties);
                        }
                    }
                    catch (Exception ex)
                    {
                        throw new ContentIOException("Property set stream SummaryInformation: " + event.getPath() + event.getName(), ex);
                    }

                }
            };
          
            POIFSReaderListener readerListener2 = new POIFSReaderListener()
            {
                public void processPOIFSReaderEvent(final POIFSReaderEvent event)
                {
                   

                   try
                    {
                        PropertySet psd = PropertySetFactory.create(event.getStream());
                        if (psd instanceof DocumentSummaryInformation)
                        {
                            DocumentSummaryInformation dsi = (DocumentSummaryInformation) psd;
                              putRawValue(KEY_CUSTOM1, (Serializable) dsi.getCustomProperties().get("Custom1"), rawProperties);
                              putRawValue(KEY_CUSTOM2, (Serializable) dsi.getCustomProperties().get("Custom2"), rawProperties);
                              putRawValue(KEY_FOO, (Serializable) dsi.getCustomProperties().get("Foo"), rawProperties);
                        }
                    }
                    catch (Exception exd)
                    {
                        throw new ContentIOException("Property set stream DocumentSummaryInfomration: " + event.getPath() + event.getName(), exd);
                    }

                }
            };
          
            InputStream is = null;
            try
            {
                is = reader.getContentInputStream();
                POIFSReader poiFSReader = new POIFSReader();
                poiFSReader.registerListener(readerListener1, SummaryInformation.DEFAULT_STREAM_NAME);
                poiFSReader.registerListener(readerListener2, DocumentSummaryInformation.DEFAULT_STREAM_NAME);
                poiFSReader.read(is);
            }
            finally
            {
                if (is != null)
                {
                    try { is.close(); } catch (IOException e) {}
                }
            }
            return rawProperties;
        }
    }

<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE beans PUBLIC '-//SPRING//DTD BEAN//EN' 'http://www.springframework.org/dtd/spring-beans.dtd'>
<!--
       This sample shows how to modify the mapping properties of the new V2.1 Metadata Extractors.
       In this example, in addition to the default mappings, the field 'user1' is mapped to
       'cm:description'.  The available source properties are described in the Javadocs of the
       extracter class.
-->
<beans>

    <!-- Replaces the 'extracter.Office' bean with the custom extracter class. -->
    <bean id="extracter.Office" class="com.mycompany.extract.MyOfficeMetadataExtracter" parent="baseMetadataExtracter" >
        <property name="inheritDefaultMapping">
            <value>true</value>
        </property>
        <property name="mappingProperties">
            <props>
               <prop key="namespace.prefix.cm">http://www.alfresco.org/model/content/1.0</prop>
               <prop key="namespace.prefix.mymodel">com.mycompany.model</prop>  <!-- defined in myCustomModel.xml as namespace uri -->
               <prop key="author">cm:author</prop>
               <prop key="title">cm:title</prop>
               <prop key="subject">cm:description</prop>
               <prop key="createDateTime">cm:created</prop>
               <prop key="lastSaveDateTime">cm:modified</prop>
               <prop key="Custom1">mymodel:Custom1</prop>
               <prop key="Custom2">mymodel:Custom2</prop>
               <!-- The key must match the raw key emitted by the extracter, which is the lower-case "foo". -->
               <prop key="foo">mymodel:Foo</prop>
            </props>
        </property>
    </bean>

</beans>