How to programmatically extract mathematical formulas from Word with Mathematica 9?

Get all the files here.

.NET Mathematica Word Library

You will need to use a Microsoft library to open Word documents. In a .NET language such as C# this is easy: open Visual Studio, reference the Microsoft.Office.Interop.Word .NET DLL (for the text) and C:\Program Files\Open XML SDK\V2.5\lib\DocumentFormat.OpenXml.dll (for the formulas, which are converted to the MathML format). Then build this C# code:

using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Xml;
using System.Xml.Xsl;
using DocumentFormat.OpenXml.Packaging;
using Microsoft.Office.Interop.Word;

namespace MathematicaWordHelper
{
    /// <summary>
    /// Helper methods that read the text and the equations of a Microsoft Word document.
    /// </summary>
    /// <remarks>
    /// Both MathML method pairs declare a default value for their second parameter, so pass the
    /// second argument explicitly (an <c>int</c> Office version or a <c>string</c> XSL path) to
    /// select the overload you want.
    /// </remarks>
    public class WordHelper
    {
        /// <summary>
        /// Opens a Microsoft Word document read-only through Word automation and returns the
        /// text of its content range.
        /// </summary>
        /// <param name="docFilePath">Path of the Word document to open.</param>
        /// <returns>The text of the document.</returns>
        public string GetWordDocumentText(string docFilePath)
        {
            _Application oWord = new Application();
            try
            {
                _Document oDoc = oWord.Documents.Open(docFilePath, ReadOnly: true);
                try
                {
                    return oDoc.Content.Text;
                }
                finally
                {
                    // Close and release the document even when reading its text fails.
                    oDoc.Close();
                    Marshal.ReleaseComObject(oDoc);
                }
            }
            finally
            {
                // Always shut Word down; otherwise a failed Open leaves a Word process behind.
                oWord.Quit(false);
                Marshal.ReleaseComObject(oWord);
            }
        }

        /// <summary>
        /// Convenience overload that builds the OMML2MML.XSL path from the Office version number,
        /// assuming Office is installed under <c>c:\Program Files\Microsoft Office</c>.
        /// </summary>
        /// <param name="docFilePath">Path of the Word document to read.</param>
        /// <param name="officeVersion">Office version number used in the <c>OfficeNN</c> folder name.</param>
        /// <returns>The MathML produced by transforming the whole main document part.</returns>
        public string GetWordDocumentAsMathML(string docFilePath, int officeVersion = 15)
        {
            return GetWordDocumentAsMathML(docFilePath, BuildDefaultTransformPath(officeVersion));
        }

        /// <summary>
        /// Transforms the whole main document part with the given XSL file and returns the result
        /// as a single string, i.e. all equations of the document together.
        /// MathML reference: http://en.wikipedia.org/wiki/MathML
        /// </summary>
        /// <param name="docFilePath">Path of the Word document to read.</param>
        /// <param name="officeMathMLSchemaFilePath">Path of the OMML2MML.XSL stylesheet.</param>
        /// <returns>The transformation output as one string.</returns>
        public string GetWordDocumentAsMathML(string docFilePath, string officeMathMLSchemaFilePath = @"c:\Program Files\Microsoft Office\Office15\OMML2MML.XSL")
        {
            using (WordprocessingDocument doc = WordprocessingDocument.Open(docFilePath, false))
            {
                string wordDocXml = doc.MainDocumentPart.Document.OuterXml;
                XslCompiledTransform xslTransform = LoadTransform(officeMathMLSchemaFilePath);
                return TransformToMathML(xslTransform, wordDocXml);
            }
        }

        /// <summary>
        /// Convenience overload that builds the OMML2MML.XSL path from the Office version number,
        /// assuming Office is installed under <c>c:\Program Files\Microsoft Office</c>.
        /// </summary>
        /// <param name="docFilePath">Path of the Word document to read.</param>
        /// <param name="officeVersion">Office version number used in the <c>OfficeNN</c> folder name.</param>
        /// <returns>One MathML string per equation paragraph found in the document.</returns>
        public string[] GetWordDocumentAsMathMLFormulas(string docFilePath, int officeVersion = 15)
        {
            return GetWordDocumentAsMathMLFormulas(docFilePath, BuildDefaultTransformPath(officeVersion));
        }

        /// <summary>
        /// Transforms every <c>DocumentFormat.OpenXml.Math.Paragraph</c> element of the main
        /// document part separately and returns the results as an array.
        /// MathML reference: http://en.wikipedia.org/wiki/MathML
        /// </summary>
        /// <param name="docFilePath">Path of the Word document to read.</param>
        /// <param name="officeMathMLSchemaFilePath">Path of the OMML2MML.XSL stylesheet.</param>
        /// <returns>
        /// One string per math paragraph, in document order; an empty array when the document
        /// contains no math paragraph.
        /// </returns>
        public string[] GetWordDocumentAsMathMLFormulas(string docFilePath, string officeMathMLSchemaFilePath = @"c:\Program Files\Microsoft Office\Office15\OMML2MML.XSL")
        {
            List<string> officeMLFormulas = new List<string>();
            using (WordprocessingDocument doc = WordprocessingDocument.Open(docFilePath, false))
            {
                // Compiled once and reused for every formula. Loaded lazily so that a document
                // without equations still never touches the XSL file.
                XslCompiledTransform xslTransform = null;

                foreach (var formula in doc.MainDocumentPart.Document.Descendants<DocumentFormat.OpenXml.Math.Paragraph>())
                {
                    if (xslTransform == null)
                    {
                        xslTransform = LoadTransform(officeMathMLSchemaFilePath);
                    }

                    officeMLFormulas.Add(TransformToMathML(xslTransform, formula.OuterXml));
                }
            }
            return officeMLFormulas.ToArray();
        }

        /// <summary>
        /// Builds the OMML2MML.XSL path for the given Office version number.
        /// </summary>
        private static string BuildDefaultTransformPath(int officeVersion)
        {
            return @"c:\Program Files\Microsoft Office\Office" + officeVersion.ToString() +
                   @"\OMML2MML.XSL";
        }

        /// <summary>
        /// Loads and compiles the stylesheet at the given path.
        /// </summary>
        private static XslCompiledTransform LoadTransform(string officeMathMLSchemaFilePath)
        {
            XslCompiledTransform xslTransform = new XslCompiledTransform();
            xslTransform.Load(officeMathMLSchemaFilePath);
            return xslTransform;
        }

        /// <summary>
        /// Runs the compiled stylesheet over the given XML and returns the output as a string
        /// without an XML declaration.
        /// </summary>
        private static string TransformToMathML(XslCompiledTransform xslTransform, string officeMathXml)
        {
            XmlWriterSettings settings = xslTransform.OutputSettings.Clone();

            // Configure xml writer to omit xml declaration.
            settings.ConformanceLevel = ConformanceLevel.Fragment;
            settings.OmitXmlDeclaration = true;

            // The memory stream is read after the writer is disposed, so it must stay open.
            settings.CloseOutput = false;

            using (TextReader tr = new StringReader(officeMathXml))
            using (XmlReader reader = XmlReader.Create(tr))
            using (MemoryStream ms = new MemoryStream())
            {
                // Disposing the writer flushes any buffered output before the stream is read.
                using (XmlWriter xw = XmlWriter.Create(ms, settings))
                {
                    // Transform our OfficeMathML to MathML.
                    xslTransform.Transform(reader, xw);
                }

                ms.Seek(0, SeekOrigin.Begin);

                using (StreamReader sr = new StreamReader(ms, Encoding.UTF8))
                {
                    return sr.ReadToEnd();
                }
            }
        }
    }
}

Calling .NET from Mathematica

In a Mathematica notebook (or similar), load the .NET Mathematica Word Library DLL (built from the C# code above) and get the text of the Word document with this code:

(* Load .NET/Link. Context names are case-sensitive: the package is NETLink`, not NetLink`. *)
<< NETLink`
InstallNET[]

(* Load the helper assembly built from the C# code and create a WordHelper instance. *)
LoadNETAssembly["c:\\temp\\MmaWord\\MathematicaWordHelper.dll"] 
obj = NETNew["MathematicaWordHelper.WordHelper"];

(* Read the plain text of the Word document. *)
wordsInDocument = obj@GetWordDocumentText["C:\\temp\\MmaWord\\WordDocWithFormulas.docx"]

Result

Formula in Word: enter image description here

Fetching text into Mathematica Notebook: enter image description here

Refer to the guide for more help: http://reference.wolfram.com/language/NETLink/tutorial/Overview.html http://reference.wolfram.com/language/NETLink/tutorial/CallingNETFromTheWolframLanguage.html#23489

Importing the formulas (as words not XML Math ML) from Word is formatted incorrectly

enter image description here

OK, I see the problem you are having with equations involving two-dimensional layout structures. Fortunately, our friendly fellow Mathematica community members have suggested MathML to the rescue.

P.S. this is a well known issue with Microsoft and Wolfram, for example if you copy a Mathematica line into Word or Outlook it comes out in this weird format. And as we see above, fetching data from MS Word into Mathematica renders in an even more misinterpreted format.

The MathML XML Method

I added GetWordDocumentAsMathML and GetWordDocumentAsMathMLFormulas methods and included the referenced DLLs and the .NET Project in the download: http://JeremyThompson.net/Rocks/Mathematica/MmaWord.zip

So now we try to get the formula from Mathematica:

(* Pass the Office version as an Integer (15, not "15"): a String second argument selects the
   overload whose second parameter is the full path of the XSL file. *)
s1 = obj@GetWordDocumentAsMathML[
   "C:\\temp\\MmaWord\\FormulaExamples.docx", 15]

(* Strip the "mml:" prefix, start each tag on a new line, import the text as MathML and
   convert the result to a held expression. *)
ImportString[
  StringReplace[
   s1, {"mml:" -> "", Except[StartOfString, "<"] -> "\n<"}], 
  "MathML"] // ToExpression[#1, StandardForm, HoldForm] & 

But oh no, it combines all the formulas:

enter image description here

In this case we need to call the third method of the .NET DLL, GetWordDocumentAsMathMLFormulas, from Mathematica (this time I am using the overload that lets me specify the full path of the XSL file). Both methods have these overloads, as shown in the C# code:

(* Fetch every equation paragraph as a separate MathML string, naming the XSL file explicitly. *)
s2 = obj@GetWordDocumentAsMathMLFormulas[
   "C:\\temp\\MmaWord\\FormulaExamples.docx",
   "c:\\Program Files\\Microsoft Office\\Office15\\OMML2MML.XSL"]

(* Take the last equation, strip the "mml:" prefix, start each tag on a new line, import the
   text as MathML and convert the result to a held expression. *)
ToExpression[
 ImportString[
  StringReplace[
   Last[s2], {"mml:" -> "", Except[StartOfString, "<"] -> "\n<"}],
  "MathML"],
 StandardForm, HoldForm]

Pay attention to "Last[s2]" in the above Mathematica query

In summary we now have three methods to extract data from Word. 1. Get the Words. 2. Get the Equations altogether. 3. Get the Equations as a string array.

enter image description here

Why don't I get any MathML returned?

If only the header MathML XML is returned, it is because there are no equations in the document:

<mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" />

enter image description here


@WolframFan The answer is great; it helped me understand the XML document format. Here is my code.

Part 1: If linear formulas are wanted, try this

<< NETLink`
(* fGetDocLinearFormulas[file, docVisible] opens file in Word through COM, copies each item of
   the document's Content.OMaths collection to the clipboard in turn and pastes it, followed by
   a line break, at the current front-end selection.
   docVisible (True or False) is assigned to the Visible property of the Word application.
   Returns $Failed when the Word COM object cannot be created, otherwise Null.
   The pattern True | False is used instead of BooleanQ, which is not available in Mathematica 9. *)
fGetDocLinearFormulas[file_String, docVisible : (True | False)] :=
 Module[{word, oms, bar},
  (*load types*)
  LoadNETType["System.Windows.Forms.Clipboard"];
  LoadNETType["System.Windows.Forms.TextDataFormat"];

  (*create app*)
  word = CreateCOMObject["Word.Application"];
  If[! NETObjectQ@word, Return[$Failed]];

  (*open file*)
  word@Visible = docVisible;
  word@Documents@Open[file];
  oms = word@ActiveDocument@Content@OMaths;
  bar = word@ActiveDocument@CommandBars;

  (*listing: copy each equation in Word, paste it, then paste a line break*)
  Do[
   oms@Item[i]@Range@Select[];
   bar@ExecuteMso["Copy"];
   FrontEndTokenExecute["Paste"];
   Clipboard`SetText["\n"];
   FrontEndTokenExecute["Paste"],
   {i, oms@Count}];

  (*cleaning: quit Word, then release the COM references held by this session*)
  word@Quit[];
  Scan[ReleaseCOMObject, {oms, bar, word}];
  (*no return*)
  ]
(* One-argument form: run with the Word window hidden. *)
fGetDocLinearFormulas[file_String] := fGetDocLinearFormulas[file, False]


(* WARNING: taskkill with /f force-terminates every process named winword.exe, not only ones
   started from this session; save any open Word documents before evaluating this line. *)
Import["! taskkill /im winword.exe /f ", "Text"];
(* Paste the equations of the example document into the notebook, with the Word window visible. *)
fGetDocLinearFormulas["E:\\formula examples.docx", True]

enter image description here

and the result is shown in a Text cell.

enter image description here

for many files in directories, do

 (* Apply the function to every *.docx file found directly in D:\ and E:\ *)
 Map[fGetDocLinearFormulas, FileNames["*.docx", {"D:\\", "E:\\"}]]

Part 2

Or, if MathML formulas are wanted, some options should first be set up as follows

enter image description here

then try this

<< NETLink`

(* fGetDocMathMLFormulas[file, docVisible] opens file in Word through COM, copies each item of
   the document's Content.OMaths collection to the clipboard in turn, reads the clipboard text
   and imports it as MathML.
   docVisible (True or False) is assigned to the Visible property of the Word application.
   Returns the list of imported formulas wrapped in DisplayForm, or $Failed when the Word COM
   object cannot be created.
   The local variable list names each symbol once (a duplicate entry triggers Module::dup), and
   True | False is used instead of BooleanQ, which is not available in Mathematica 9. *)
fGetDocMathMLFormulas[file_String, docVisible : (True | False)] :=
 Module[{word, oms, bar, fmlist, mml},
  (*load types*)
  LoadNETType["System.Windows.Forms.Clipboard"];
  LoadNETType["System.Windows.Forms.TextDataFormat"];

  (*create app*)
  word = CreateCOMObject["Word.Application"];
  If[! NETObjectQ@word, Return[$Failed]];

  (*open file*)
  word@Visible = docVisible;
  word@Documents@Open[file];
  oms = word@ActiveDocument@Content@OMaths;
  bar = word@ActiveDocument@CommandBars;

  (*listing: copy each equation in Word and import the clipboard text as MathML*)
  fmlist = Table[
    oms@Item[i]@Range@Select[];
    bar@ExecuteMso["Copy"];
    mml = Clipboard`GetText[TextDataFormat`UnicodeText];
    DisplayForm@ImportString[mml, "MathML"],
    {i, oms@Count}];

  (*cleaning: quit Word, then release the COM references held by this session*)
  word@Quit[];
  Scan[ReleaseCOMObject, {oms, bar, word}];

  (*return*)
  fmlist
  ]
(* One-argument form: run with the Word window hidden. *)
fGetDocMathMLFormulas[file_String] := fGetDocMathMLFormulas[file, False]


(* WARNING: taskkill with /f force-terminates every process named winword.exe, not only ones
   started from this session; save any open Word documents before evaluating this line. *)
Import["! taskkill /im winword.exe /f ", "Text"];
(* Return the equations of the example document as imported MathML, with the Word window visible. *)
fGetDocMathMLFormulas["E:\\formula examples.docx",True]

enter image description here

for many files in directories, do

 (* Apply the function to every *.docx file found directly in D:\ and E:\ *)
 Map[fGetDocMathMLFormulas, FileNames["*.docx", {"D:\\", "E:\\"}]]