Extracting and parsing structured data with jQuery Selector, XPath or JsonPath from common web formats like HTML, XML and JSON.
Extracting and parsing structured data with jQuery Selector, XPath or JsonPath from common web formats like HTML, XML and JSON.
Implements:
To add a dependency on Web-Data-Extractor using Maven, use the following:
<dependency>
<groupId>im.nll.data</groupId>
<artifactId>extractor</artifactId>
<version>0.9.6</version>
</dependency>
To add a dependency using Gradle:
dependencies {
    // The 'compile' configuration was deprecated in Gradle 3.4 and removed in Gradle 7;
    // 'implementation' is its replacement. On Gradle older than 3.4, use 'compile' instead.
    implementation 'im.nll.data:extractor:0.9.6'
}
// Constructor form: start from baseHtml, extract with a SelectorExtractor for "div.followers",
// chain a RegexExtractor for \d+ via with(...), and read the result as a String.
String followers = Extractors.on(baseHtml)
.extract(new SelectorExtractor("div.followers"))
.with(new RegexExtractor("\\d+"))
.asString();
Or use the static factory methods:
// Same chain as the constructor form, written with the selector(...) and regex(...) helpers
// (presumably static imports — confirm against the library's Extractors class).
String followers = Extractors.on(baseHtml)
.extract(selector("div.followers"))
.with(regex("\\d+"))
.asString();
Or use the short string form:
String followers = Extractors.on(baseHtml)
.extract("selector:div.followers"))
.with(regex("\\d+"))
.asString();
More methods:
// Chain selector, filter and regex steps on one input, then verify the final string.
String quote = "<div> Talk is cheap. Show me the code. - Fri, 25 Aug 2000 </div>";
String year = Extractors.on(quote)
        .extract(selector("div"))           // extract with the "div" selector
        .filter(text -> text.trim())        // trim the extracted value
        .with(regex("20\\d{2}"))            // apply the year regex
        .filter(found -> "from " + found)   // prepend the "from " string
        .asString();
Assert.assertEquals("from 2000", year);
/**
 * Extracts three named fields from {@code baseHtml} and reads them back as a
 * {@code Map<String, String>} keyed by the names passed to {@code extract}.
 */
@Test
public void testToMap() throws Exception {
    // One extract(name, extractor) call per field; with(...) follows the "followers" extract.
    Map<String, String> fields = Extractors.on(baseHtml)
            .extract("title", selector("a.title"))
            .extract("followers", selector("div.followers"))
            .with(regex("\\d+"))
            .extract("description", selector("div.description"))
            .asMap();

    Assert.assertEquals("fivesmallq", fields.get("title"));
    Assert.assertEquals("29671", fields.get("followers"));
    Assert.assertEquals("Talk is cheap. Show me the code.", fields.get("description"));
}
/**
 * Splits {@code listHtml} into items and extracts the same three fields from each one,
 * producing one {@code Map<String, String>} per item.
 */
@Test
public void testToMapList() throws Exception {
    // The split parameter must implement ListableExtractor.
    List<Map<String, String>> languages = Extractors.on(listHtml)
            .split(selector("tr.item.html"))
            .extract("type", selector("td.type"))
            .extract("name", selector("td.name"))
            .extract("url", selector("td.url"))
            .asMapList();

    Assert.assertNotNull(languages);
    // Check the size before indexing so a short list fails with an assertion message
    // instead of an IndexOutOfBoundsException.
    Assert.assertEquals(3, languages.size());

    // Expected value first, matching the other examples in this document.
    Map<String, String> second = languages.get(1);
    Assert.assertEquals("dynamic", second.get("type"));
    Assert.assertEquals("Ruby", second.get("name"));
    Assert.assertEquals("https://www.ruby-lang.org", second.get("url"));
}
/**
 * Extracts three named fields from {@code baseHtml} and reads them back through the
 * getters of a {@code Base} instance obtained from {@code asBean(Base.class)}.
 */
@Test
public void testToBean() throws Exception {
    // Same field definitions as the map example; only the terminal call differs.
    Base bean = Extractors.on(baseHtml)
            .extract("title", selector("a.title"))
            .extract("followers", selector("div.followers"))
            .with(regex("\\d+"))
            .extract("description", selector("div.description"))
            .asBean(Base.class);

    Assert.assertEquals("fivesmallq", bean.getTitle());
    Assert.assertEquals("29671", bean.getFollowers());
    Assert.assertEquals("Talk is cheap. Show me the code.", bean.getDescription());
}
/**
 * Splits {@code listHtml} into items and extracts the same three fields from each one,
 * producing one {@code Language} per item via {@code asBeanList(Language.class)}.
 */
@Test
public void testToBeanList() throws Exception {
    List<Language> languages = Extractors.on(listHtml)
            .split(selector("tr.item.html"))
            .extract("type", selector("td.type"))
            .extract("name", selector("td.name"))
            .extract("url", selector("td.url"))
            .asBeanList(Language.class);

    Assert.assertNotNull(languages);
    // Check the size before indexing so a short list fails with an assertion message
    // instead of an IndexOutOfBoundsException.
    Assert.assertEquals(3, languages.size());

    // Expected value first, matching the other examples in this document.
    Language second = languages.get(1);
    Assert.assertEquals("dynamic", second.getType());
    Assert.assertEquals("Ruby", second.getName());
    Assert.assertEquals("https://www.ruby-lang.org", second.getUrl());
}
Set the value of a field on an embedded (nested) object by using embeddable.fieldName as the field name:
/**
 * Fills nested objects of {@code Activity} by using dotted field names such as
 * {@code config.encoding} and {@code inputBindings.fileName}, then checks the values
 * through {@code getConfig()} and {@code getInputBindings()}.
 */
@Test
public void testEmbeddable() {
    List<Activity> activities = Extractors.on(base5Xml)
            .split(xpath("//ProcessDefinition/activity").removeNamespace())
            .extract("name", xpath("//activity/@name"))
            .extract("type", xpath("//activity/type/text()"))
            .extract("resourceType", xpath("//activity/resourceType/text()"))
            .extract("config.encoding", xpath("//activity/config/encoding/text()"))
            .extract("config.pollInterval", xpath("//activity/config/pollInterval/text()"))
            // If pollInterval is null, fall back to the default "5"; otherwise keep the
            // extracted value. (The branches were previously swapped, which left a null
            // value as null and overwrote every real value with "5".)
            .filter(value -> value == null ? "5" : value)
            .extract("config.compressFile", xpath("//activity/config/compressFile/text()"))
            .extract("inputBindings.fileName", xpath("//activity/inputBindings/WriteActivityInputTextClass/fileName/value-of/@select"))
            .extract("inputBindings.textContent", xpath("//activity/inputBindings/WriteActivityInputTextClass/textContent/value-of/@select"))
            .asBeanList(Activity.class);

    Assert.assertNotNull(activities);
    Assert.assertEquals(1, activities.size());
    Activity activity = activities.get(0);
    Assert.assertEquals("Output1", activity.getName());
    Assert.assertEquals("com.tibco.plugin.file.FileWriteActivity", activity.getType());

    // Fields set through the "config." prefix
    Config config = activity.getConfig();
    Assert.assertEquals("text", config.getEncoding());
    Assert.assertEquals("None", config.getCompressFile());
    Assert.assertEquals("5", config.getPollInterval());

    // Fields set through the "inputBindings." prefix
    BindingSpec bindingSpec = activity.getInputBindings();
    Assert.assertEquals("$_globalVariables/ns:GlobalVariables/GlobalVariables/OutputLocation", bindingSpec.getFileName());
    Assert.assertEquals("$File-Poller/pfx:EventSourceOuputTextClass/fileContent/textContent", bindingSpec.getTextContent());
}
before and after are global filters: they are applied to every extracted value, and each field's own filter runs afterwards.
/**
 * Combines the global {@code before}/{@code after} filters with per-field {@code filter}
 * calls and checks the resulting text of each field on the second {@code Language}.
 */
@Test
public void testToBeanListFilterBeforeAndAfter() throws Exception {
    List<Language> languages = Extractors.on(listHtml)
            // before and after process every extracted value first;
            // the filter attached to each field is executed after them.
            .before(value -> "|before|" + value)
            .after(value -> value + "|after|")
            .split(xpath("//tr[@class='item']"))
            .extract("type", xpath("//td[1]/text()")).filter(value -> "filter:" + value)
            .extract("name", xpath("//td[2]/text()")).filter(value -> "filter:" + value)
            .extract("url", xpath("//td[3]/text()")).filter(value -> "filter:" + value)
            .asBeanList(Language.class);

    Assert.assertNotNull(languages);
    // Check the size before indexing so a short list fails with an assertion message
    // instead of an IndexOutOfBoundsException.
    Assert.assertEquals(3, languages.size());

    // Expected value first, matching the other examples in this document.
    Language second = languages.get(1);
    Assert.assertEquals("filter:|before|dynamic|after|", second.getType());
    Assert.assertEquals("filter:|before|Ruby|after|", second.getName());
    Assert.assertEquals("filter:|before|https://www.ruby-lang.org|after|", second.getUrl());
}
see Example
Bug reports and pull requests are welcome on GitHub at https://github.com/fivesmallq/web-data-extractor.