11package net .pdfix ;
22
3+ import java .io .File ;
4+ import java .io .FileNotFoundException ;
35import java .nio .channels .NonReadableChannelException ;
46import java .util .ArrayList ;
57import java .util .List ;
68
79import net .pdfix .pdfixlib .*;
810
911public class FindDuplicateMcid {
12+ private static int kReportTypeDplicateMcid = 1 ;
13+ private static int kReportTypeArtifactMcid = 2 ;
14+
1015 // Helper function to get a readable object type
1116 private static String getNiceObjType (PdfPageObjectType type ) {
1217 switch (type ) {
@@ -42,8 +47,13 @@ private static String getObjContent(PdsPageObject obj) {
4247 return info .toString ();
4348 }
4449
45- public static void reportMcid (int pageNum , PdsPageObject obj , int index , int mcid ) {
46- System .out .println ("Duplicate MCID Found:" );
50+ public static void reportMcid (int pageNum , PdsPageObject obj , int index , int mcid , int reportType ) {
51+ // report type
52+ if (reportType == kReportTypeDplicateMcid ) {
53+ System .out .println ("Error: Duplicate MCID found" );
54+ } else if (reportType == kReportTypeArtifactMcid ) {
55+ System .out .println ("Warning: Artifact with MCID found" );
56+ }
4757 String objType = getNiceObjType (obj .GetObjectType ());
4858 String objBBox = getObjBBox (obj );
4959 String objContent = getObjContent (obj );
@@ -63,11 +73,40 @@ public static void reportMcid(int pageNum, PdsPageObject obj, int index, int mci
6373 System .out .println (info .toString ());
6474 }
6575
76+ private static Boolean compareContentMarkMCID (PdsPageObject obj1 , PdsPageObject obj2 ) {
77+ if (obj1 == obj2 ) {
78+ return true ;
79+ }
80+ if ((obj1 == null ) || (obj2 == null )) {
81+ return false ;
82+ }
83+ PdsContentMark cm1 = obj1 .GetContentMark ();
84+ PdsContentMark cm2 = obj2 .GetContentMark ();
85+
86+ // compare content mark index with MCID
87+ if (cm1 .GetTagMcid () != cm2 .GetTagMcid ()) {
88+ return false ;
89+ }
90+
91+ // compare content mark names, manes on each index must me equal
92+ for (int i = 0 ; i <= cm1 .GetTagMcid (); i ++) {
93+ if (cm1 .GetTagName (i ).compareTo (cm2 .GetTagName (i )) != 0 ) {
94+ return false ;
95+ }
96+ }
97+ return true ;
98+ }
99+
66100 // Check for duplicate MCIDs in a PDF file. Return the number of duplicate mcids
67101 // found
68102 public static int checkDuplicateMcid (String path ) throws Exception {
69103 Pdfix pdfix = new Pdfix ();
70104
105+ File file = new File (path );
106+ if (!file .exists ()) {
107+ throw new FileNotFoundException (path );
108+ }
109+
71110 PdfDoc doc = pdfix .OpenDoc (path , "" );
72111 if (doc == null ) {
73112 throw new RuntimeException (pdfix .GetError ());
@@ -89,34 +128,37 @@ public static int checkDuplicateMcid(String path) throws Exception {
89128 }
90129
91130 int lastMcid = -1 ;
131+ PdsPageObject lastObject = null ;
92132 List <Integer > mcids = new ArrayList <Integer >();
93- PdsPageObject lastObj = null ;
94133 for (int j = 0 ; j < content .GetNumObjects (); j ++) {
95134 PdsPageObject obj = content .GetObject (j );
135+ PdsContentMark contentMark = obj .GetContentMark ();
96136 int mcid = obj .GetMcid ();
97- if ((mcid != -1 ) && (mcid == lastMcid )) {
98- // content marks must be equal for equal mcid
99- if (lastObj != null ) {
100- if (obj .GetNumEqualTags (lastObj ) != obj .GetContentMark ().GetNumTags ()) {
101- reportMcid (i , obj , j , mcid );
102- found ++;
103- }
104- }
105- } else if (mcid != lastMcid ) {
137+ Boolean isArtifact = (contentMark .GetTagArtifact () != -1 );
138+
139+ // reports following options:
140+ // Error: duplicate MCID in tagged content (second MCID occurrence can be in tagged content or artifact)
141+ // Warning: MCID set for Artifact (it may be used in tag tree)
142+
143+ if ((mcid != lastMcid ) || ((mcid != -1 ) && (lastObject != null ) && (!compareContentMarkMCID (obj , lastObject )))) {
106144 lastMcid = mcid ;
107145 if (mcid == -1 ) {
108146 continue ;
109147 }
110-
111148 if (mcids .contains (mcid )) {
112- reportMcid (i , obj , j , mcid );
149+ reportMcid (i , obj , j , mcid , kReportTypeDplicateMcid );
113150 found ++;
114151 }
115152 mcids .add (mcid );
116153 }
117- lastObj = obj ;
154+ if (isArtifact && (mcid != -1 )) {
155+ if (mcid != -1 ) {
156+ reportMcid (i , obj , j , mcid , kReportTypeArtifactMcid );
157+ }
158+ lastMcid = -1 ;
159+ }
160+ lastObject = obj ;
118161 }
119-
120162 page .Release ();
121163 }
122164
0 commit comments