Skip to content

Commit

Permalink
Merge pull request #251 from metanorma/table_width_fix
Browse files Browse the repository at this point in the history
table width algorithm performance optimization, #245
  • Loading branch information
Intelligent2013 authored May 26, 2024
2 parents ae5925f + 4f0be7c commit 5a3d9d8
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 52 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ SHELL ?= /bin/bash
endif

#JAR_VERSION := $(shell mvn -q -Dexec.executable="echo" -Dexec.args='$${project.version}' --non-recursive exec:exec -DforceStdout)
JAR_VERSION := 1.88
JAR_VERSION := 1.90
JAR_FILE := mn2pdf-$(JAR_VERSION).jar

all: target/$(JAR_FILE)
Expand Down
10 changes: 5 additions & 5 deletions README.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ You will need the Java Development Kit (JDK) version 8, Update 241 (8u241) or hi

[source,sh]
----
java -Xss5m -Xmx2048m -jar target/mn2pdf-1.89.jar --xml-file <XML-FileName> --xsl-file <XSLT-FileName> --pdf-file <Output-PDF-FileName> [--syntax-highlight]
java -Xss5m -Xmx2048m -jar target/mn2pdf-1.90.jar --xml-file <XML-FileName> --xsl-file <XSLT-FileName> --pdf-file <Output-PDF-FileName> [--syntax-highlight]
----

e.g.

[source,sh]
----
java -Xss5m -Xmx2048m -jar target/mn2pdf-1.89.jar --xml-file tests/G.191.xml --xsl-file tests/itu.recommendation.xsl --pdf-file tests/G.191.pdf
java -Xss5m -Xmx2048m -jar target/mn2pdf-1.90.jar --xml-file tests/G.191.xml --xsl-file tests/itu.recommendation.xsl --pdf-file tests/G.191.pdf
----

=== PDF encryption features
Expand Down Expand Up @@ -100,7 +100,7 @@ Update version in `pom.xml`, e.g.:
----
<groupId>org.metanorma.fop</groupId>
<artifactId>mn2pdf</artifactId>
<version>1.89</version>
<version>1.90</version>
<name>Metanorma XML to PDF converter</name>
----

Expand All @@ -111,8 +111,8 @@ Tag the same version in Git:

[source,sh]
----
git tag v1.89
git push origin v1.89
git tag v1.90
git push origin v1.90
----

Then the corresponding GitHub release will be automatically created at:
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.metanorma.fop</groupId>
<artifactId>mn2pdf</artifactId>
<version>1.89</version>
<version>1.90</version>
<name>Metanorma XML to PDF converter</name>
<packaging>jar</packaging>
<url>https://www.metanorma.org</url>
Expand Down
84 changes: 65 additions & 19 deletions src/main/java/org/metanorma/fop/PDFGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import javax.xml.parsers.*;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
Expand Down Expand Up @@ -277,7 +278,8 @@ public boolean process() {
isAddMathAsText = xsltConverter.hasParamAddMathAsText() && isMathExists;
isAddMathAsAttachment = xsltConverter.hasParamAddMathAsAttachment();

isApplyAutolayoutAlgorithm = xsltConverter.isApplyAutolayoutAlgorithm();
isApplyAutolayoutAlgorithm = xsltConverter.isApplyAutolayoutAlgorithm();


if (isSyntaxHighlight) {
xsltParams.put("syntax-highlight", "true");
Expand Down Expand Up @@ -1173,6 +1175,8 @@ private void readEncryptionParameters(File fEncryptionParameters) {

private void setTablesWidths(fontConfig fontcfg, XSLTconverter xsltConverter, File pdf) {

int TABLE_CELLS_COUNT_MAX = 30000;

String methodName = getClass().getSimpleName() + "." + (new Object(){}.getClass().getEnclosingMethod().getName());
Profiler.addMethodCall(methodName);
long startMethodTime = System.currentTimeMillis();
Expand All @@ -1196,7 +1200,7 @@ private void setTablesWidths(fontConfig fontcfg, XSLTconverter xsltConverter, Fi
SourceXMLDocument sourceXMLDocumentTablesOnly = new SourceXMLDocument(xmlTablesOnly);

int countTableCells = sourceXMLDocumentTablesOnly.getCountTableCells();
if (countTableCells < 30000) {
if (countTableCells < TABLE_CELLS_COUNT_MAX) {
// transform XML to XSL-FO (XML .fo file)
xsltConverter.transform(sourceXMLDocumentTablesOnly, false);

Expand All @@ -1218,46 +1222,79 @@ private void setTablesWidths(fontConfig fontcfg, XSLTconverter xsltConverter, Fi

} else { // for large tables, or large number of tables

List<String> tablesIds = sourceXMLDocumentTablesOnly.readElementsIds("//*[local-name() = 'table' or local-name() = 'dl']");

List<String> xmlTablesIF = new ArrayList<>();
// process each table separately for memory consumption optimization
int tableCounter = 0;
int tableCount = tablesIds.size();
for (String tableId : tablesIds) {
tableCounter++;
logger.info("[INFO] Generation of XSL-FO (" + tableCounter + "/" + tableCount + ") with information about the table widths with id='" + tableId + "'...");

// process table with id=tableId only
xsltConverter.setParam("table_only_with_id", tableId);
Map<String,Integer> tablesCellsCountMap = sourceXMLDocumentTablesOnly.getTablesCellsCountMap();

int portion = 1;
while(!tablesCellsCountMap.isEmpty()) {
int totalCells = 0;
List<String> tablesProcessed = new ArrayList<>();

Iterator<Map.Entry<String, Integer>> iterator = tablesCellsCountMap.entrySet().iterator();
while (iterator.hasNext() && totalCells < TABLE_CELLS_COUNT_MAX) {
Map.Entry<String, Integer> entry = iterator.next();
if (totalCells == 0 || totalCells + entry.getValue() < TABLE_CELLS_COUNT_MAX) {
totalCells += entry.getValue();
tablesProcessed.add(entry.getKey());
}
}

/*for (Map.Entry<String, Integer> entry : tablesCellsCountMap.entrySet()) {
else {
break;
}
}*/
logger.info("[INFO] Generation of XSL-FO (portion " + portion + ") with information about the table widths...");

// "table1 table2 table3 " (with space at the end)
String tableIds = tablesProcessed.stream().collect(Collectors.joining(" ")) + " ";
// call XSLT and pass the tables ids

// process table with ids=tableIds only
xsltConverter.setParam("table_only_with_ids", tableIds);

// transform XML to XSL-FO (XML .fo file)
xsltConverter.transform(sourceXMLDocumentTablesOnly, false);

String xmlFO = sourceXMLDocumentTablesOnly.getXMLFO();

//debug
debugSaveXML(xmlFO, pdf.getAbsolutePath() + "." + tableId + ".fo.tables.xml");
debugSaveXML(xmlFO, pdf.getAbsolutePath() + ".portion_" + portion + ".fo.tables.xml");

fontcfg.outputFontManifestLog(Paths.get(pdf.getAbsolutePath() + "." + tableId + ".tables.fontmanifest.log.txt"));
fontcfg.outputFontManifestLog(Paths.get(pdf.getAbsolutePath() + ".portion_" + portion + ".tables.fontmanifest.log.txt"));

fontcfg.setSourceDocumentFontList(sourceXMLDocumentTablesOnly.getDocumentFonts());

Source sourceFO = new StreamSource(new StringReader(xmlFO));

logger.info("[INFO] Generation of Intermediate Format (" + tableCounter + "/" + tableCount + ") with information about the table's widths with id='" + tableId + "'...");
String xmlIF = generateFOPIntermediateFormat(sourceFO, fontcfg.getConfig(), pdf, true, "." + tableId + ".tables");
logger.info("[INFO] Generation of Intermediate Format with information about the table's widths (portion " + portion + ") ...");
String xmlIF = generateFOPIntermediateFormat(sourceFO, fontcfg.getConfig(), pdf, true, ".portion_" + portion + ".tables");

xmlTableIF = createTableIF(xmlIF);

debugSaveXML(xmlTableIF, pdf.getAbsolutePath() + "." + tableId + ".tables.xml");
debugSaveXML(xmlTableIF, pdf.getAbsolutePath() + ".portion_" + portion + ".tables.xml");

xmlTableIF = tableWidthsCleanup(xmlTableIF);

xmlTablesIF.add(xmlTableIF);

// remove processed tables
tablesCellsCountMap.keySet().removeAll(tablesProcessed);
portion++;
}

/*List<String> tablesIds = sourceXMLDocumentTablesOnly.readElementsIds("//*[local-name() = 'table' or local-name() = 'dl']");
// process each table separately for memory consumption optimization
int tableCounter = 0;
int tableCount = tablesIds.size();
for (String tableId : tablesIds) {
tableCounter++;
logger.info("[INFO] Generation of XSL-FO (" + tableCounter + "/" + tableCount + ") with information about the table widths with id='" + tableId + "'...");
}*/
xmlTableIF = tablesWidthsUnion(xmlTablesIF);
xsltConverter.setParam("table_only_with_id", ""); // further process all tables
xsltConverter.setParam("table_only_with_ids", ""); // further process all tables
}

debugSaveXML(xmlTableIF, pdf.getAbsolutePath() + ".tables.xml");
Expand Down Expand Up @@ -1330,11 +1367,17 @@ private void saveDebugFO(String debugXSLFO) {
}

/**
 * Simplifies the XML with the tables width information by applying
 * the stylesheet {@code table_if_clean.xsl} to it.
 *
 * @param table XML string with the tables width information
 * @return the result of the transformation, or the unchanged input string
 *         if the transformation failed (the failure is logged)
 */
private String tableWidthsCleanup(String table) {
    try {
        table = applyXSLT("table_if_clean.xsl", table, false);
    } catch (Exception ex) {
        // best effort: keep the original XML when the cleanup can't be applied
        logger.severe("Can't simplify the tables width information XML.");
        ex.printStackTrace();
    }
    return table;
}

Expand All @@ -1344,6 +1387,9 @@ private String tablesWidthsUnion(List<String> tables) {
sbTablesIF.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?><tables>");
}
for (String itemTableIF: tables) {
int startPos = itemTableIF.indexOf("<table ");
int endPos = itemTableIF.indexOf("</tables>");
itemTableIF = itemTableIF.substring(startPos, endPos);
sbTablesIF.append(itemTableIF);
}
if (!tables.isEmpty()) {
Expand Down
72 changes: 46 additions & 26 deletions src/main/java/org/metanorma/fop/SourceXMLDocument.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package org.metanorma.fop;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
Expand All @@ -14,10 +13,7 @@
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.*;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
Expand All @@ -26,7 +22,7 @@
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import static org.metanorma.fop.PDFGenerator.logger;

import static org.metanorma.fop.Util.getStreamFromResources;

import org.metanorma.utils.LoggerHelper;
Expand All @@ -52,6 +48,7 @@ public class SourceXMLDocument {

private boolean hasAnnotations = false;
private boolean hasTables = false;
private Map<String, Integer> tablesCellsCountMap = new HashMap<>();
private boolean hasMath = false;

static final String TMPDIR = System.getProperty("java.io.tmpdir");
Expand Down Expand Up @@ -89,6 +86,7 @@ public SourceXMLDocument(String strXML) {
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
InputSource xmlIFIS = new InputSource(new StringReader(strXML));
sourceXML = dBuilder.parse(xmlIFIS);
readMetaInformation();
} catch (Exception ex) {
logger.severe("Can't parse source XML.");
ex.printStackTrace();
Expand All @@ -97,14 +95,41 @@ public SourceXMLDocument(String strXML) {

/**
 * Reads the document-level flags from the source XML:
 * presence of 'review' elements (annotations), 'math' elements,
 * and tables/definition lists for which the cells count was collected.
 */
private void readMetaInformation() {
    String element_review = readValue("//*[local-name() = 'review'][1]");
    hasAnnotations = element_review.length() != 0;
    String element_math = readValue("//*[local-name() = 'math'][1]");
    hasMath = element_math.length() != 0;
    // tables without colgroup/col (width) or dl:
    // hasTables is derived from the collected cells count map,
    // so no separate XPath lookup for the first table is needed
    obtainTablesCellsCount();
    hasTables = !tablesCellsCountMap.isEmpty();
}

/**
 * Fills {@code tablesCellsCountMap} with the cells count of each table-like element
 * of the source XML, keyed by the element's 'id' attribute.
 * <p>
 * Selected elements: 'table' without 'colgroup'/'col' children, and 'dl'.
 * Counted cells: descendant 'td', 'th', 'dt' and 'dd' elements.
 * Elements without an 'id' attribute (or with an empty one) are skipped.
 * An XPath failure is logged and leaves the map with the entries collected so far.
 */
private void obtainTablesCellsCount() {
    try {
        XPath xPath = XPathFactory.newInstance().newXPath();
        // select all tables (without colgroup) and definitions lists (dl)
        XPathExpression queryAllTables = xPath.compile("//*[(local-name() = 'table' and not(*[local-name() = 'colgroup']/*[local-name() = 'col'])) or local-name() = 'dl']");
        // the cells query is the same for every table: compile it once, outside of the loop,
        // and evaluate it relative to each table node
        XPathExpression queryTableCountCells = xPath.compile(".//*[local-name() = 'td' or local-name() = 'th' or local-name() = 'dt' or local-name() = 'dd']");

        NodeList nodesTables = (NodeList) queryAllTables.evaluate(sourceXML, XPathConstants.NODESET);
        for (int i = 0; i < nodesTables.getLength(); i++) {
            Node nodeTable = nodesTables.item(i);
            Node nodeId = nodeTable.getAttributes().getNamedItem("id");
            if (nodeId == null) {
                continue; // no id: the table can't be addressed, skip it
            }
            String tableId = nodeId.getTextContent();
            if (tableId.isEmpty()) {
                continue;
            }
            NodeList nodesCells = (NodeList) queryTableCountCells.evaluate(nodeTable, XPathConstants.NODESET);
            tablesCellsCountMap.put(tableId, nodesCells.getLength());
        }
    } catch (XPathExpressionException ex) {
        logger.severe(ex.toString());
    }
}

public StreamSource getStreamSource() {
if (sourceXMLstr.isEmpty()) {
Expand Down Expand Up @@ -411,7 +436,6 @@ public String getDocumentFilePath() {
return documentFilePath;
}


private String updatePreprocessXSLT(Document docXML) throws Exception {

Source srcXSL = new StreamSource(getStreamFromResources(getClass().getClassLoader(), "update_preprocess_xslt.xsl"));
Expand Down Expand Up @@ -441,19 +465,6 @@ private String readValue(String xpath) {
return value;
}

/**
 * Counts the 'td', 'th', 'dt' and 'dd' elements in the whole source XML.
 *
 * @return the number of matched elements, or 0 if the lookup failed (the failure is logged)
 */
private int readTableCellsCount(){
    final String cellsXPath = "//*[local-name() = 'td' or local-name() = 'th' or local-name() = 'dt' or local-name() = 'dd']";
    try {
        XPathExpression cellsQuery = XPathFactory.newInstance().newXPath().compile(cellsXPath);
        NodeList cells = (NodeList) cellsQuery.evaluate(sourceXML, XPathConstants.NODESET);
        return cells.getLength();
    } catch (Exception ex) {
        logger.severe(ex.toString());
        return 0;
    }
}

public List<String> readElementsIds(String xpath) {
List<String> values = new ArrayList<>();
try {
Expand Down Expand Up @@ -487,7 +498,12 @@ public boolean hasMath() {
}

/**
 * Returns the total number of table cells in the document.
 *
 * @return the sum of the per-table cells counts stored in {@code tablesCellsCountMap};
 *         0 when the map is empty
 */
public int getCountTableCells() {
    // single declaration, computed from the already collected per-table counts
    return tablesCellsCountMap.values().stream().mapToInt(Integer::intValue).sum();
}

Expand All @@ -496,4 +512,8 @@ public void flushResources() {
sourceXMLstr = "";
xmlFO = null;
}

/**
 * Returns the cells count of each table-like element, keyed by the element id.
 *
 * @return a new mutable copy of the internal map; the caller may freely modify it
 *         (e.g. remove already processed tables) without changing the state of this document
 */
public Map<String, Integer> getTablesCellsCountMap() {
    // defensive copy: keeps the internal map intact when the caller removes entries
    return new HashMap<>(tablesCellsCountMap);
}
}
13 changes: 13 additions & 0 deletions src/test/java/org/metanorma/fop/SourceXMLDocumentTests.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.transform.TransformerException;
import org.apache.commons.cli.ParseException;

Expand Down Expand Up @@ -92,4 +94,15 @@ public void testGetDocumentPreprocessXSLT() {

assertTrue(strProcessXSLT.equals(strProcessXSLTEtalon));
}

/**
 * Checks the number of collected tables and the total cells count
 * for the G.191.xml test resource.
 */
@Test
public void testTablesCellsCount() {
    // locate the sample document among the test resources
    String xmlPath = getClass().getClassLoader().getResource("G.191.xml").getFile();
    SourceXMLDocument document = new SourceXMLDocument(new File(xmlPath));

    Map<String,Integer> cellsPerTable = document.getTablesCellsCountMap();
    int totalCells = document.getCountTableCells();

    assertTrue(cellsPerTable.size() == 27);
    assertTrue(totalCells == 725);
}
}

0 comments on commit 5a3d9d8

Please sign in to comment.