Skip to content

Commit

Permalink
Merge pull request #132 from asidorowicz/master
Browse files Browse the repository at this point in the history
Improve SmartPdfCopy compression and performance
  • Loading branch information
VahidN authored Nov 27, 2023
2 parents 353d979 + e2b2fd5 commit f7087e3
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 43 deletions.
91 changes: 75 additions & 16 deletions src/iTextSharp.LGPLv2.Core.FunctionalTests/PdfSmartCopyTests.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using iTextSharp.text;
using iTextSharp.text.pdf;
using Microsoft.VisualStudio.TestTools.UnitTesting;
Expand All @@ -9,7 +11,7 @@ namespace iTextSharp.LGPLv2.Core.FunctionalTests;
public class PdfSmartCopyTests
{
[TestMethod]
public void Verify_Remove_Duplicate_Objects_Works()
public void Verify_Remove_Duplicate_Streams_Works()
{
var inputFile = CreateALargePdfFile();
var outFile = TestUtils.GetOutputFileName();
Expand All @@ -20,25 +22,29 @@ public void Verify_Remove_Duplicate_Objects_Works()
Assert.IsTrue(new FileInfo(inputFile).Length > new FileInfo(outFile).Length);
}

private static void CompressPdfFileRemoveDuplicateObjects(string inputFile, string outFile)
[TestMethod]
public void Verify_Remove_Duplicate_Dictionaries_Works()
{
using var fileStream = new FileStream(outFile, FileMode.Create);
using var pdfDoc = new Document();
var pdfSmartCopy = new PdfSmartCopy(pdfDoc, fileStream);
pdfSmartCopy.SetFullCompression();
var inputFile = CreatePdfFileWithEmbeddedFont();
var outFile = TestUtils.GetOutputFileName();

pdfDoc.AddAuthor(TestUtils.Author);
pdfDoc.Open();
CompressMultiplePdfFilesRemoveDuplicateObjects(inputFile, outFile);

using var reader = new PdfReader(inputFile);
TestUtils.VerifyPdfFileIsReadable(outFile);

var n = reader.NumberOfPages;
for (var page = 0; page < n;)
{
pdfSmartCopy.AddPage(pdfSmartCopy.GetImportedPage(reader, ++page));
}
using var reader = new PdfReader(outFile);
var fontCount = GetPdfObjects(reader)
.OfType<PdfDictionary>()
.Select(d => d.GetDirectObject(PdfName.TYPE))
.Where(PdfName.Fontdescriptor.Equals)
.Count();

pdfSmartCopy.FreeReader(reader);
Assert.AreEqual(1, fontCount);
}

/// <summary>
/// Copies <paramref name="inputFile"/> into <paramref name="outFile"/> a single time by
/// delegating to <c>CompressMultiplePdfFilesRemoveDuplicateObjects</c> with a repeat count of 1.
/// </summary>
/// <param name="inputFile">Path of the PDF file to read.</param>
/// <param name="outFile">Path of the PDF file to create.</param>
private static void CompressPdfFileRemoveDuplicateObjects(string inputFile, string outFile)
    => CompressMultiplePdfFilesRemoveDuplicateObjects(inputFile, outFile, times: 1);

private string CreateALargePdfFile()
Expand All @@ -48,7 +54,7 @@ private string CreateALargePdfFile()
{
using (var pdfDoc = new Document(PageSize.A4))
{
var pdfWriter = PdfWriter.GetInstance(pdfDoc, fileStream);
PdfWriter.GetInstance(pdfDoc, fileStream);

pdfDoc.AddAuthor(TestUtils.Author);
pdfDoc.Open();
Expand All @@ -66,4 +72,57 @@ private string CreateALargePdfFile()
TestUtils.VerifyPdfFileIsReadable(pdfFilePath);
return pdfFilePath;
}

/// <summary>
/// Writes a one-paragraph A4 PDF whose text uses the Tahoma font returned by
/// <c>TestUtils.GetUnicodeFont</c>, passes the file to
/// <c>TestUtils.VerifyPdfFileIsReadable</c>, and returns its path.
/// </summary>
/// <returns>The path of the generated PDF file.</returns>
private string CreatePdfFileWithEmbeddedFont()
{
    var outputPath = TestUtils.GetOutputFileName();

    // Scoped using blocks (not using declarations): the document and the stream
    // are both disposed before the file is verified below.
    using (var output = new FileStream(outputPath, FileMode.Create))
    using (var document = new Document(PageSize.A4))
    {
        PdfWriter.GetInstance(document, output);
        document.AddAuthor(TestUtils.Author);
        document.Open();

        var tahoma = TestUtils.GetUnicodeFont("Tahoma", TestUtils.GetTahomaFontPath(), 10, Font.NORMAL, BaseColor.Black);
        var paragraph = new Paragraph("Document with embedded font", tahoma);
        document.Add(paragraph);
    }

    TestUtils.VerifyPdfFileIsReadable(outputPath);
    return outputPath;
}


/// <summary>
/// Copies every page of <paramref name="inputFile"/> into <paramref name="outFile"/> through a
/// <c>PdfSmartCopy</c> with full compression switched on, importing the whole input
/// <paramref name="times"/> times in a row.
/// </summary>
/// <param name="inputFile">Path of the PDF file to read.</param>
/// <param name="outFile">Path of the PDF file to create.</param>
/// <param name="times">How many times the input file is appended to the output.</param>
private static void CompressMultiplePdfFilesRemoveDuplicateObjects(string inputFile, string outFile, int times = 10)
{
    using var output = new FileStream(outFile, FileMode.Create);
    using var document = new Document();
    var smartCopy = new PdfSmartCopy(document, output);
    smartCopy.SetFullCompression();

    document.AddAuthor(TestUtils.Author);
    document.Open();

    // Appending the same document several times produces duplicate
    // dictionaries (e.g. FontDescriptors) in the copied content.
    for (var pass = 1; pass <= times; pass++)
    {
        // A fresh reader per pass; it is disposed at the end of each iteration.
        using var reader = new PdfReader(inputFile);

        // Page numbers passed to GetImportedPage start at 1.
        var pageCount = reader.NumberOfPages;
        for (var pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            var importedPage = smartCopy.GetImportedPage(reader, pageNumber);
            smartCopy.AddPage(importedPage);
        }

        smartCopy.FreeReader(reader);
    }
}

/// <summary>
/// Lazily yields the result of <c>GetPdfObjectRelease</c> for each index from 0 up to,
/// but not including, <c>reader.XrefSize</c>.
/// </summary>
/// <param name="reader">The reader whose cross-reference entries are enumerated.</param>
/// <returns>A deferred sequence with one element per cross-reference index.</returns>
private IEnumerable<object> GetPdfObjects(PdfReader reader)
{
    var index = 0;

    // XrefSize is re-read on every iteration, as in a plain for loop.
    while (index < reader.XrefSize)
    {
        yield return reader.GetPdfObjectRelease(index);
        index++;
    }
}
}
2 changes: 1 addition & 1 deletion src/iTextSharp.LGPLv2.Core/iTextSharp/text/pdf/PdfCopy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -937,7 +937,7 @@ public override bool Equals(object obj)
return false;
}

return Gen == other.Gen && Num == other.Num;
return Num == other.Num && Gen == other.Gen;
}

public override int GetHashCode() => (Gen << 16) + Num;
Expand Down
62 changes: 36 additions & 26 deletions src/iTextSharp.LGPLv2.Core/iTextSharp/text/pdf/PdfSmartCopy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,20 +47,6 @@ protected override PdfIndirectReference CopyIndirect(PrIndirectReference inp)
return null;
}

ByteStore streamKey = null;
var validStream = false;

if (srcObj.IsStream())
{
streamKey = new ByteStore(srcObj);
validStream = true;
var streamRef = _streamMap[streamKey];
if (streamRef != null)
{
return streamRef;
}
}

PdfIndirectReference theRef;
var key = new RefKey(inp);
var iRef = Indirects[key];
Expand All @@ -74,9 +60,25 @@ protected override PdfIndirectReference CopyIndirect(PrIndirectReference inp)
}
else
{
ByteStore streamKey = null;
if (srcObj.IsStream() || srcObj.IsDictionary())
{
streamKey = new ByteStore(srcObj);
var streamRef = _streamMap[streamKey];
if (streamRef != null)
{
return streamRef;
}
}

theRef = Body.PdfIndirectReference;
iRef = new IndirectReferences(theRef);
Indirects[key] = iRef;

if (streamKey != null)
{
_streamMap[streamKey] = theRef;
}
}

if (srcObj.IsDictionary())
Expand All @@ -90,11 +92,6 @@ protected override PdfIndirectReference CopyIndirect(PrIndirectReference inp)

iRef.SetCopied();

if (validStream)
{
_streamMap[streamKey] = theRef;
}

var obj = CopyObject(srcObj);
AddToBody(obj, theRef);
return theRef;
Expand All @@ -103,10 +100,12 @@ protected override PdfIndirectReference CopyIndirect(PrIndirectReference inp)
internal class ByteStore
{
private readonly byte[] _b;
private List<RefKey> _references;

internal ByteStore(PdfObject str)
{
var bb = new ByteBuffer();
_references = new List<RefKey>();
var level = 100;
serObject(str, level, bb);
_b = bb.ToByteArray();
Expand Down Expand Up @@ -199,18 +198,29 @@ private void serObject(PdfObject obj, int level, ByteBuffer bb)
return;
}

if (obj.IsIndirect())
{
var refKey = new RefKey((PdfIndirectReference)obj);
var refIdx = _references.IndexOf(refKey);
if (refIdx >= 0)
{
// Already seen, print relative reference label only
bb.Append($"$R{refIdx}");
return;
}

// First occurrence, print relative reference label and process content
bb.Append($"$R{_references.Count}");
_references.Add(refKey);
}

obj = PdfReader.GetPdfObject(obj);
if (obj.IsStream())
{
bb.Append("$B");
serDic((PdfDictionary)obj, level - 1, bb);
if (level > 0)
{
using (var md5 = MD5BouncyCastle.Create())
{
bb.Append(md5.ComputeHash(PdfReader.GetStreamBytesRaw((PrStream)obj)));
}
}
using var md5 = MD5BouncyCastle.Create();
bb.Append(md5.ComputeHash(PdfReader.GetStreamBytesRaw((PrStream)obj)));
}
else if (obj.IsDictionary())
{
Expand Down

0 comments on commit f7087e3

Please sign in to comment.