Skip to content

Commit 3991c5b

Browse files
Merge pull request #859 from TamimEhsan/NUTCH-3122
[NUTCH-3122] Make SpellCheckedMetadata case-insensitive for all Metadata names
2 parents 7e43e12 + 365f585 commit 3991c5b

File tree

2 files changed

+88
-2
lines changed

2 files changed

+88
-2
lines changed

src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@
2626
/**
2727
* A decorator to Metadata that adds spellchecking capabilities to property
2828
* names. Currently used spelling vocabulary contains just the HTTP headers from
29-
* {@link HttpHeaders} class.
29+
* {@link HttpHeaders} class. Other names are case insensitive.
3030
*
3131
*/
32-
public class SpellCheckedMetadata extends Metadata {
32+
public class SpellCheckedMetadata extends CaseInsensitiveMetadata {
3333

3434
/**
3535
* Threshold divider to calculate max. Levenshtein distance for misspelled

src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.io.DataInputStream;
2222
import java.io.DataOutputStream;
2323
import java.io.IOException;
24+
import java.util.HashMap;
2425
import java.util.Properties;
2526

2627
import org.junit.Assert;
@@ -111,6 +112,33 @@ public void testSet() {
111112
Assert.assertEquals("new value 2", values[1]);
112113
}
113114

115+
/** Test for the <code>set(String, String)</code> method. */
116+
@Test
117+
public void testSetCaseInsensitive() {
118+
String[] values = null;
119+
SpellCheckedMetadata meta = new SpellCheckedMetadata();
120+
121+
values = meta.getValues("name-one");
122+
Assert.assertEquals(0, values.length);
123+
124+
meta.set("name-one", "value1");
125+
values = meta.getValues("name-one");
126+
Assert.assertEquals(1, values.length);
127+
Assert.assertEquals("value1", values[0]);
128+
129+
meta.set("naMe-OnE", "value2");
130+
values = meta.getValues("name-one");
131+
Assert.assertEquals(1, values.length);
132+
Assert.assertEquals("value2", values[0]);
133+
134+
meta.set("nAme-One", "new value 1");
135+
meta.add("NamE-oNe", "new value 2");
136+
values = meta.getValues("namE-OnE");
137+
Assert.assertEquals(2, values.length);
138+
Assert.assertEquals("new value 1", values[0]);
139+
Assert.assertEquals("new value 2", values[1]);
140+
}
141+
114142
/** Test for <code>setAll(Properties)</code> method. */
115143
@Test
116144
public void testSetProperties() {
@@ -151,6 +179,19 @@ public void testGet() {
151179
Assert.assertEquals("value-1", meta.get("a-name"));
152180
}
153181

182+
/** Test for <code>get(String)</code> method. */
183+
@Test
184+
public void testGetCaseInsensitive() {
185+
SpellCheckedMetadata meta = new SpellCheckedMetadata();
186+
Assert.assertNull(meta.get("a-name"));
187+
188+
meta.add("a-name", "value-1");
189+
Assert.assertEquals("value-1", meta.get("a-name"));
190+
191+
Assert.assertNotNull(meta.get("a-NamE"));
192+
Assert.assertEquals("value-1", meta.get("a-NamE"));
193+
}
194+
154195
/** Test for <code>isMultiValued()</code> method. */
155196
@Test
156197
public void testIsMultiValued() {
@@ -252,6 +293,51 @@ public void testWritable() {
252293
Assert.assertEquals("text/html", result.get(Metadata.CONTENT_TYPE));
253294
}
254295

296+
/** Test for <code>Writable</code> implementation. */
297+
@Test
298+
public void testWritableBackwardCompatibility() {
299+
// Entries added with old instances of SpellCheckedMetadata
300+
// should be readable by new instances of SpellCheckedMetadata
301+
SpellCheckedMetadata result = null;
302+
CaseSensitiveSpellCheckedMetadata meta = new CaseSensitiveSpellCheckedMetadata();
303+
304+
result = writeRead(meta);
305+
Assert.assertEquals(0, result.size());
306+
307+
meta.add("name-One", "value-1.1");
308+
// Check that the original case is kept for old Metadata class
309+
Assert.assertEquals(0, result.getValues("naMe-one").length);
310+
311+
// Check that the values written by old instances can be
312+
// read by new instances of SpellCheckedMetadata
313+
result = writeRead(meta);
314+
Assert.assertEquals(1, result.size());
315+
Assert.assertEquals(1, result.getValues("naMe-one").length);
316+
Assert.assertEquals("value-1.1", result.get("nAme-oNe"));
317+
318+
meta.add("Contenttype", "text/html");
319+
meta.add("name-Two", "value-2.1");
320+
meta.add("namE-two", "value-2.2");
321+
result = writeRead(meta);
322+
Assert.assertEquals(3, result.size());
323+
Assert.assertEquals(1, result.getValues("name-onE").length);
324+
Assert.assertEquals("value-1.1", result.getValues("namE-one")[0]);
325+
Assert.assertEquals(2, result.getValues("name-two").length);
326+
Assert.assertEquals("value-2.1", result.getValues("nAme-tWo")[0]);
327+
Assert.assertEquals("value-2.2", result.getValues("namE-Two")[1]);
328+
Assert.assertEquals("text/html", result.get(Metadata.CONTENT_TYPE));
329+
}
330+
331+
/**
332+
* Mock class to test backward compatibility of SpellCheckedMetadata
333+
* after changing the internal storage to be case insensitive.
334+
*/
335+
private static class CaseSensitiveSpellCheckedMetadata extends SpellCheckedMetadata {
336+
public CaseSensitiveSpellCheckedMetadata() {
337+
metadata = new HashMap<>();
338+
}
339+
}
340+
255341
/**
256342
* IO Test method, usable only when you plan to do changes in metadata to
257343
* measure relative performance impact.

0 commit comments

Comments
 (0)