Skip to content

Commit 48e4ecc

Browse files
authored
strip charset from mimes (#2775)
1 parent 35ebe92 commit 48e4ecc

5 files changed

Lines changed: 35 additions & 28 deletions

File tree

tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,7 @@ public abstract class ProfilerBase {
102102
public static TableInfo REF_PARSE_EXCEPTION_TYPES =
103103
new TableInfo("ref_parse_exception_types", new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER), new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128));
104104
public static TableInfo MIME_TABLE = new TableInfo("mimes", new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
105+
new ColInfo(Cols.BASE_MIME, Types.VARCHAR, 256),
105106
new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12));
106107
private static CommonTokenCountManager COMMON_TOKEN_COUNT_MANAGER;
107108
private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");

tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@ public enum Cols {
5151

5252

5353
MIME_STRING,//string representation of mime type
54+
BASE_MIME,//mime type without parameters (charset, delimiter, etc.)
5455

5556
DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
5657
DIR_NAME_B,

tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,10 @@ public class MimeBuffer extends AbstractDBBuffer {
3636

3737
public MimeBuffer(Connection connection, TableInfo mimeTable, MimeTypes mimeTypes) throws SQLException {
3838
st = connection.prepareStatement(
39-
"insert into " + mimeTable.getName() + "( " + Cols.MIME_ID.name() + ", " + Cols.MIME_STRING.name() + ", " + Cols.FILE_EXTENSION.name() + ") values (?,?,?)");
39+
"insert into " + mimeTable.getName() + "( " +
40+
Cols.MIME_ID.name() + ", " + Cols.MIME_STRING.name() +
41+
", " + Cols.BASE_MIME.name() + ", " +
42+
Cols.FILE_EXTENSION.name() + ") values (?,?,?,?)");
4043
this.mimeTypes = mimeTypes;
4144
this.connection = connection;
4245
}
@@ -47,15 +50,17 @@ public void write(int id, String value) throws RuntimeException {
4750
st.clearParameters();
4851
st.setInt(1, id);
4952
st.setString(2, value);
53+
int semi = value.indexOf(';');
54+
st.setString(3, semi > 0 ? value.substring(0, semi).trim() : value);
5055
try {
5156
String ext = MimeUtil.getExtension(value, mimeTypes);
5257
if (ext == null || ext.isEmpty()) {
53-
st.setNull(3, Types.VARCHAR);
58+
st.setNull(4, Types.VARCHAR);
5459
} else {
55-
st.setString(3, ext);
60+
st.setString(4, ext);
5661
}
5762
} catch (MimeTypeException e) {
58-
st.setNull(3, Types.VARCHAR);
63+
st.setNull(4, Types.VARCHAR);
5964
}
6065
st.execute();
6166

tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1510,12 +1510,12 @@
15101510
includeSql="true">
15111511

15121512
<sql>
1513-
select mime_string, count(1) as cnt
1513+
select base_mime, count(1) as cnt
15141514
from profiles_a pa
15151515
left join profiles_b pb on pa.id=pb.id
15161516
join mimes m on pa.mime_id=m.mime_id
15171517
where pb.id is null
1518-
group by mime_string
1518+
group by base_mime
15191519
order by cnt desc
15201520
</sql>
15211521
</report>
@@ -1525,12 +1525,12 @@
15251525
includeSql="true">
15261526

15271527
<sql>
1528-
select mime_string, count(1) as cnt
1528+
select base_mime, count(1) as cnt
15291529
from profiles_a pa
15301530
left join profiles_b pb on pa.id=pb.id
15311531
join mimes m on pa.mime_id=m.mime_id
15321532
where pb.id is null and pa.is_embedded=false
1533-
group by mime_string
1533+
group by base_mime
15341534
order by cnt desc
15351535
</sql>
15361536
</report>
@@ -1540,12 +1540,12 @@
15401540
includeSql="true">
15411541

15421542
<sql>
1543-
select mime_string, count(1) as cnt
1543+
select base_mime, count(1) as cnt
15441544
from profiles_a pa
15451545
left join profiles_b pb on pa.id=pb.id
15461546
join mimes m on pa.mime_id=m.mime_id
15471547
where pb.id is null and pa.is_embedded=true
1548-
group by mime_string
1548+
group by base_mime
15491549
order by cnt desc
15501550
</sql>
15511551
</report>
@@ -1555,12 +1555,12 @@
15551555
includeSql="true">
15561556

15571557
<sql>
1558-
select mime_string, count(1) as cnt
1558+
select base_mime, count(1) as cnt
15591559
from profiles_b pb
15601560
left join profiles_a pa on pb.id=pa.id
15611561
join mimes m on pb.mime_id=m.mime_id
15621562
where pa.id is null
1563-
group by mime_string
1563+
group by base_mime
15641564
order by cnt desc
15651565
</sql>
15661566
</report>
@@ -1570,12 +1570,12 @@
15701570
includeSql="true">
15711571

15721572
<sql>
1573-
select mime_string, count(1) as cnt
1573+
select base_mime, count(1) as cnt
15741574
from profiles_b pb
15751575
left join profiles_a pa on pb.id=pa.id
15761576
join mimes m on pb.mime_id=m.mime_id
15771577
where pa.id is null and pb.is_embedded=false
1578-
group by mime_string
1578+
group by base_mime
15791579
order by cnt desc
15801580
</sql>
15811581
</report>
@@ -1585,12 +1585,12 @@
15851585
includeSql="true">
15861586

15871587
<sql>
1588-
select mime_string, count(1) as cnt
1588+
select base_mime, count(1) as cnt
15891589
from profiles_b pb
15901590
left join profiles_a pa on pb.id=pa.id
15911591
join mimes m on pb.mime_id=m.mime_id
15921592
where pa.id is null and pb.is_embedded=true
1593-
group by mime_string
1593+
group by base_mime
15941594
order by cnt desc
15951595
</sql>
15961596
</report>

tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1318,12 +1318,12 @@
13181318
includeSql="true">
13191319

13201320
<sql>
1321-
select mime_string, count(1) as cnt
1321+
select base_mime, count(1) as cnt
13221322
from profiles_a pa
13231323
left join profiles_b pb on pa.id=pb.id
13241324
join mimes m on pa.mime_id=m.mime_id
13251325
where pb.id is null
1326-
group by mime_string
1326+
group by base_mime
13271327
order by cnt desc
13281328
</sql>
13291329
</report>
@@ -1333,12 +1333,12 @@
13331333
includeSql="true">
13341334

13351335
<sql>
1336-
select mime_string, count(1) as cnt
1336+
select base_mime, count(1) as cnt
13371337
from profiles_a pa
13381338
left join profiles_b pb on pa.id=pb.id
13391339
join mimes m on pa.mime_id=m.mime_id
13401340
where pb.id is null and pa.is_embedded=false
1341-
group by mime_string
1341+
group by base_mime
13421342
order by cnt desc
13431343
</sql>
13441344
</report>
@@ -1348,12 +1348,12 @@
13481348
includeSql="true">
13491349

13501350
<sql>
1351-
select mime_string, count(1) as cnt
1351+
select base_mime, count(1) as cnt
13521352
from profiles_a pa
13531353
left join profiles_b pb on pa.id=pb.id
13541354
join mimes m on pa.mime_id=m.mime_id
13551355
where pb.id is null and pa.is_embedded=true
1356-
group by mime_string
1356+
group by base_mime
13571357
order by cnt desc
13581358
</sql>
13591359
</report>
@@ -1363,12 +1363,12 @@
13631363
includeSql="true">
13641364

13651365
<sql>
1366-
select mime_string, count(1) as cnt
1366+
select base_mime, count(1) as cnt
13671367
from profiles_b pb
13681368
left join profiles_a pa on pb.id=pa.id
13691369
join mimes m on pb.mime_id=m.mime_id
13701370
where pa.id is null
1371-
group by mime_string
1371+
group by base_mime
13721372
order by cnt desc
13731373
</sql>
13741374
</report>
@@ -1378,12 +1378,12 @@
13781378
includeSql="true">
13791379

13801380
<sql>
1381-
select mime_string, count(1) as cnt
1381+
select base_mime, count(1) as cnt
13821382
from profiles_b pb
13831383
left join profiles_a pa on pb.id=pa.id
13841384
join mimes m on pb.mime_id=m.mime_id
13851385
where pa.id is null and pb.is_embedded=false
1386-
group by mime_string
1386+
group by base_mime
13871387
order by cnt desc
13881388
</sql>
13891389
</report>
@@ -1393,12 +1393,12 @@
13931393
includeSql="true">
13941394

13951395
<sql>
1396-
select mime_string, count(1) as cnt
1396+
select base_mime, count(1) as cnt
13971397
from profiles_b pb
13981398
left join profiles_a pa on pb.id=pa.id
13991399
join mimes m on pb.mime_id=m.mime_id
14001400
where pa.id is null and pb.is_embedded=true
1401-
group by mime_string
1401+
group by base_mime
14021402
order by cnt desc
14031403
</sql>
14041404
</report>

0 commit comments

Comments
 (0)