-
Notifications
You must be signed in to change notification settings - Fork 134
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'aecio/issue-74' into master (issue #74)
- Loading branch information
Showing
7 changed files
with
302 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
214 changes: 214 additions & 0 deletions
214
src/main/java/focusedCrawler/target/repository/ElasticSearchRestTargetRepository.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,214 @@ | ||
package focusedCrawler.target.repository; | ||
|
||
import java.io.IOException; | ||
import java.io.UnsupportedEncodingException; | ||
import java.net.MalformedURLException; | ||
import java.net.URL; | ||
import java.net.URLEncoder; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
import org.apache.http.HttpHost; | ||
import org.apache.http.client.config.RequestConfig; | ||
import org.apache.http.entity.AbstractHttpEntity; | ||
import org.apache.http.entity.ContentType; | ||
import org.apache.http.nio.entity.NStringEntity; | ||
import org.apache.http.util.EntityUtils; | ||
import org.elasticsearch.client.Response; | ||
import org.elasticsearch.client.RestClient; | ||
import org.elasticsearch.client.RestClientBuilder; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import com.fasterxml.jackson.core.JsonProcessingException; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
import com.fasterxml.jackson.databind.SerializationFeature; | ||
|
||
import focusedCrawler.target.model.Page; | ||
import focusedCrawler.target.model.TargetModelElasticSearch; | ||
import focusedCrawler.target.repository.elasticsearch.ElasticSearchConfig; | ||
|
||
public class ElasticSearchRestTargetRepository implements TargetRepository { | ||
|
||
private static final Map<String, String> EMPTY_MAP = Collections.<String, String>emptyMap(); | ||
private static final Logger logger = LoggerFactory.getLogger(ElasticSearchRestTargetRepository.class); | ||
private static final ObjectMapper mapper = new ObjectMapper(); | ||
|
||
static { | ||
mapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); | ||
} | ||
|
||
private RestClient client; | ||
private String typeName; | ||
private String indexName; | ||
|
||
public ElasticSearchRestTargetRepository(ElasticSearchConfig config, | ||
String indexName, | ||
String typeName) { | ||
this.indexName = indexName; | ||
this.typeName = typeName; | ||
this.client = createRestClient(config); | ||
this.createIndexMapping(indexName); | ||
} | ||
|
||
private void createIndexMapping(String indexName) { | ||
|
||
String indexEndpoint = "/" + indexName; | ||
boolean exists = false; | ||
String esVersion = "5.x.x"; | ||
try { | ||
Response existsResponse = client.performRequest("HEAD", indexEndpoint); | ||
exists = (existsResponse.getStatusLine().getStatusCode() == 200); | ||
|
||
Response rootResponse = client.performRequest("GET", "/"); | ||
String json = EntityUtils.toString(rootResponse.getEntity()); | ||
String versionNumber = mapper.readTree(json).path("version").path("number").asText(); | ||
if (versionNumber != null && !versionNumber.isEmpty()) { | ||
esVersion = versionNumber; | ||
} | ||
logger.info("Elasticsearch version: {}", esVersion); | ||
} catch (IOException e) { | ||
throw new RuntimeException( | ||
"Failed to check whether index already exists in Elasticsearch.", e); | ||
} | ||
|
||
if (!exists) { | ||
final String targetMapping1x = "" | ||
+ "{" | ||
+ " \"properties\": {" | ||
+ " \"domain\": {\"type\": \"string\",\"index\": \"not_analyzed\"}," | ||
+ " \"words\": {\"type\": \"string\",\"index\": \"not_analyzed\"}," | ||
+ " \"wordsMeta\": {\"type\": \"string\",\"index\": \"not_analyzed\"}," | ||
+ " \"retrieved\": {\"type\": \"date\",\"format\": \"dateOptionalTime\"}," | ||
+ " \"text\": {\"type\": \"string\"}," | ||
+ " \"title\": {\"type\": \"string\"}," | ||
+ " \"url\": {\"type\": \"string\",\"index\": \"not_analyzed\"}," | ||
+ " \"topPrivateDomain\": {\"type\": \"string\",\"index\": \"not_analyzed\"}" | ||
+ " }" | ||
+ "}"; | ||
|
||
final String pageMapping5x ="" | ||
+ "{" | ||
+ " \"properties\": {" | ||
+ " \"domain\": {\"type\": \"keyword\",\"index\": true}," | ||
+ " \"words\": {\"type\": \"keyword\",\"index\": true}," | ||
+ " \"wordsMeta\": {\"type\": \"keyword\",\"index\": true}," | ||
+ " \"retrieved\": {\"type\": \"date\",\"format\": \"dateOptionalTime\"}," | ||
+ " \"text\": {\"type\": \"text\"}," | ||
+ " \"title\": {\"type\": \"text\"}," | ||
+ " \"url\": {\"type\": \"keyword\",\"index\":true}," | ||
+ " \"topPrivateDomain\": {\"type\": \"keyword\",\"index\": true}" | ||
+ " }" | ||
+ "}"; | ||
|
||
String pageProperties = esVersion.startsWith("5.") ? pageMapping5x : targetMapping1x; | ||
|
||
String mapping = | ||
"{" | ||
+ " \"mappings\": {" | ||
+ " \"target\": "+ pageProperties + "," | ||
+ " \"negative\": "+ pageProperties | ||
+ " }" | ||
+ "}"; | ||
|
||
try { | ||
AbstractHttpEntity entity = createJsonEntity(mapping); | ||
Response response = client.performRequest("PUT", indexEndpoint, EMPTY_MAP, entity); | ||
if (response.getStatusLine().getStatusCode() != 200) { | ||
throw new RuntimeException( | ||
"Failed to create index in Elasticsearch." + response.toString()); | ||
} | ||
} catch (IOException e) { | ||
throw new RuntimeException("Failed to create index in Elasticsearch.", e); | ||
} | ||
} | ||
} | ||
|
||
private AbstractHttpEntity createJsonEntity(String mapping) { | ||
return new NStringEntity(mapping, ContentType.APPLICATION_JSON); | ||
} | ||
|
||
public boolean insert(Page target) { | ||
return index(target); | ||
} | ||
|
||
private boolean index(Page page) { | ||
|
||
TargetModelElasticSearch data = new TargetModelElasticSearch(page); | ||
|
||
String docId = encodeUrl(page.getURL().toString()); | ||
String endpoint = "/" + indexName + "/" + typeName + "/" + docId; | ||
AbstractHttpEntity entity = createJsonEntity(serializeAsJson(data)); | ||
try { | ||
Response response = client.performRequest("PUT", endpoint, EMPTY_MAP, entity); | ||
return response.getStatusLine().getStatusCode() == 201; | ||
} catch (IOException e) { | ||
throw new RuntimeException("Failed to index page.", e); | ||
} | ||
} | ||
|
||
private String encodeUrl(String url) { | ||
try { | ||
return URLEncoder.encode(url, "UTF-8"); | ||
} catch (UnsupportedEncodingException e) { | ||
throw new IllegalStateException("Failed to URL encode string: "+url, e); | ||
} | ||
} | ||
|
||
private String serializeAsJson(Object model) { | ||
String targetAsJson; | ||
try { | ||
targetAsJson = mapper.writeValueAsString(model); | ||
} catch (JsonProcessingException e) { | ||
throw new RuntimeException("Failed to serialize TargetModel to JSON.", e); | ||
} | ||
return targetAsJson; | ||
} | ||
|
||
public RestClient createRestClient(ElasticSearchConfig config) { | ||
|
||
List<String> esHosts = config.getRestApiHosts(); | ||
List<HttpHost> hosts = new ArrayList<>(); | ||
for (String host : esHosts) { | ||
try { | ||
URL url = new URL(host); | ||
hosts.add(new HttpHost(url.getHost(), url.getPort())); | ||
} catch (MalformedURLException e) { | ||
throw new RuntimeException("Failed to initialize Elasticsearch REST client. " | ||
+ "Invalid host: " + host, e); | ||
} | ||
} | ||
|
||
HttpHost[] httpHostsArray = (HttpHost[]) hosts.toArray(new HttpHost[hosts.size()]); | ||
|
||
client = RestClient.builder(httpHostsArray) | ||
.setRequestConfigCallback(new RestClientBuilder.RequestConfigCallback() { | ||
@Override | ||
public RequestConfig.Builder customizeRequestConfig(RequestConfig.Builder requestConfigBuilder) { | ||
return requestConfigBuilder | ||
.setConnectTimeout(config.getRestConnectTimeout()) | ||
.setSocketTimeout(config.getRestSocketTimeout()); | ||
} | ||
}) | ||
.setMaxRetryTimeoutMillis(config.getRestMaxRetryTimeoutMillis()) | ||
.build(); | ||
|
||
logger.info("Initialized Elasticsearch REST client for hosts: "+Arrays.toString(httpHostsArray)); | ||
return client; | ||
} | ||
|
||
@Override | ||
public void close() { | ||
try { | ||
if (client != null) { | ||
client.close(); | ||
} | ||
} catch (IOException e) { | ||
throw new RuntimeException("Failed to close Elasticsearch REST client", e); | ||
} | ||
} | ||
|
||
} |
Oops, something went wrong.