TownCrawler partial

This commit is contained in:
Fabio Scotto di Santolo
2017-01-12 16:57:14 +01:00
parent dae449d27d
commit c48957ef57
24 changed files with 8404 additions and 332 deletions

2
.gitignore vendored
View File

@@ -1 +1 @@
/target/ /target

View File

@@ -0,0 +1,7 @@
<root>
<facet id="jpt.jaxb">
<node name="classgen">
<attribute name="package" value="it.noah.crawler"/>
</node>
</facet>
</root>

20
sql/regions.sql Normal file
View File

@@ -0,0 +1,20 @@
-- Seed data for the regioni table: one row per region name.
-- String literals use standard single quotes; an embedded apostrophe is
-- escaped by doubling it so the script does not depend on MySQL-specific
-- double-quoted strings or backslash escapes.
insert into regioni (nome) values ('Valle d''Aosta');
insert into regioni (nome) values ('Piemonte');
insert into regioni (nome) values ('Liguria');
insert into regioni (nome) values ('Lombardia');
insert into regioni (nome) values ('Trentino-Alto Adige');
insert into regioni (nome) values ('Veneto');
insert into regioni (nome) values ('Friuli-Venezia Giulia');
insert into regioni (nome) values ('Emilia-Romagna');
insert into regioni (nome) values ('Toscana');
insert into regioni (nome) values ('Marche');
insert into regioni (nome) values ('Umbria');
insert into regioni (nome) values ('Lazio');
insert into regioni (nome) values ('Abruzzo');
insert into regioni (nome) values ('Molise');
insert into regioni (nome) values ('Campania');
insert into regioni (nome) values ('Basilicata');
insert into regioni (nome) values ('Puglia');
insert into regioni (nome) values ('Calabria');
insert into regioni (nome) values ('Sicilia');
insert into regioni (nome) values ('Sardegna');

7983
sql/towns.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,11 +0,0 @@
package it.noah.crawler.converter;
import java.util.List;
import it.noah.crawler.dom.tag.Table;
/**
 * Contract for components that turn a {@link Table} into a list of model
 * objects.
 *
 * @param <T> type of the objects produced by the conversion
 */
public interface Converter<T> {

    /**
     * Converts the given table into model objects.
     *
     * @param table table to convert
     * @return the objects built from the table
     */
    List<T> convertTable(Table table);
}

View File

@@ -0,0 +1,41 @@
package it.noah.crawler.converter;
import java.util.ArrayList;
import java.util.List;
import it.noah.crawler.dom.tag.Cell;
import it.noah.crawler.dom.tag.Table;
import it.noah.crawler.dom.tag.TableRow;
import it.noah.crawler.model.District;
/**
 * Builds {@link District} objects from the rows of a {@link Table}.
 */
public class DistrictConverter {

    /**
     * Converts every row of the table into a district.
     *
     * @param table table whose rows are converted
     * @return one district per row, in row order
     */
    public List<District> convertTable(Table table) {
        List<District> districts = new ArrayList<District>();
        for (TableRow tableRow : table.getRows()) {
            districts.add(toDistrict(tableRow));
        }
        return districts;
    }

    /**
     * Maps the cells of a row onto a district by position: cell 0 is the
     * initial, cell 1 the name and cell 2 the region. Any further cell is
     * ignored.
     */
    private District toDistrict(TableRow tableRow) {
        District result = new District();
        for (int index = 0; index < tableRow.getRow().size(); index++) {
            Cell cell = tableRow.getRow().get(index);
            if (index == 0) {
                result.setInitial(cell.getValue());
            } else if (index == 1) {
                result.setName(cell.getValue());
            } else if (index == 2) {
                result.setRegion(cell.getValue());
            }
        }
        return result;
    }
}

View File

@@ -0,0 +1,23 @@
package it.noah.crawler.converter;
import it.noah.crawler.model.District;
import it.noah.crawler.model.Town;
import it.noah.crawler.services.NoahPostalCodeSeeker;
/**
 * Converts a row of the towns CSV file into a {@link Town}.
 *
 * <p>Expected columns: 0 = town name, 1 = region, 2 = district name (or
 * {@code "-"} when absent), 3 = district name used when column 2 is
 * {@code "-"}.
 */
public class TownConverter {

    /** User name handed to the postal code lookup service. */
    private static final String LOOKUP_USERNAME = "morgoth";

    /** Country code handed to the postal code lookup service. */
    private static final String COUNTRY_CODE = "IT";

    /** Marker found in column 2 when the district is stored in column 3. */
    private static final String NO_DISTRICT = "-";

    /**
     * Builds a town from an already split CSV row and looks up its postal
     * code by town name.
     *
     * @param row the split CSV row; must hold at least 3 columns, and 4 when
     *            column 2 is {@code "-"}
     * @return the town; its postal code is whatever the lookup returned and
     *         may be {@code null}
     * @throws IllegalArgumentException if the row is {@code null} or too short
     */
    public Town convertCSVRowToTown(String[] row) {
        if (row == null || row.length < 3) {
            throw new IllegalArgumentException(
                    "CSV row must have at least 3 columns (name, region, district) but had "
                            + (row == null ? "no data" : row.length + " column(s)"));
        }

        String districtName;
        if (!NO_DISTRICT.equals(row[2])) {
            districtName = row[2];
        } else if (row.length > 3) {
            districtName = row[3];
        } else {
            // Column 2 says "no district" but the fallback column is missing.
            throw new IllegalArgumentException(
                    "CSV row for town '" + row[0] + "' has no district in column 2 or 3");
        }

        Town town = new Town();
        town.setName(row[0]);
        town.setRegion(row[1]);

        District district = new District();
        district.setName(districtName);
        town.setDistrict(district);

        town.setPostalCode(NoahPostalCodeSeeker
                .getPostalCodeFromName(LOOKUP_USERNAME, town.getName(), COUNTRY_CODE));
        return town;
    }
}

View File

@@ -1,23 +0,0 @@
package it.noah.crawler.converter.impl;
import java.util.List;
import org.geonames.PostalCode;
import it.noah.crawler.converter.Converter;
import it.noah.crawler.dom.tag.Table;
import it.noah.crawler.model.Comune;
/**
 * Placeholder {@link Converter} for {@link Comune} objects. Neither
 * conversion is implemented yet; both methods return {@code null}.
 */
public class ComuneConverter implements Converter<Comune> {

    /**
     * Not implemented.
     *
     * @param table ignored
     * @return always {@code null}
     */
    @Override
    public List<Comune> convertTable(Table table) {
        return null;
    }

    /**
     * Not implemented.
     *
     * @param postalCodes ignored
     * @return always {@code null}
     */
    public List<Comune> convertPostalCode(List<PostalCode> postalCodes) {
        // TODO: to be implemented
        return null;
    }
}

View File

@@ -1,43 +0,0 @@
package it.noah.crawler.converter.impl;
import java.util.ArrayList;
import java.util.List;
import it.noah.crawler.converter.Converter;
import it.noah.crawler.dom.tag.Cell;
import it.noah.crawler.dom.tag.Table;
import it.noah.crawler.dom.tag.TableRow;
import it.noah.crawler.model.Provincia;
/**
 * {@link Converter} that builds {@link Provincia} objects from the rows of a
 * {@link Table}.
 */
public class ProvinciaConverter implements Converter<Provincia> {

    /**
     * Converts every row of the table into a provincia.
     *
     * @param table table whose rows are converted
     * @return one provincia per row, in row order
     */
    @Override
    public List<Provincia> convertTable(Table table) {
        List<Provincia> converted = new ArrayList<Provincia>();
        for (TableRow tableRow : table.getRows()) {
            converted.add(toProvincia(tableRow));
        }
        return converted;
    }

    /**
     * Maps the cells of a row onto a provincia by position: cell 0 is the
     * sigla, cell 1 the nome and cell 2 the regione. Any further cell is
     * ignored.
     */
    private Provincia toProvincia(TableRow tableRow) {
        Provincia result = new Provincia();
        for (int index = 0; index < tableRow.getRow().size(); index++) {
            Cell cell = tableRow.getRow().get(index);
            if (index == 0) {
                result.setSigla(cell.getValue());
            } else if (index == 1) {
                result.setNome(cell.getValue());
            } else if (index == 2) {
                result.setRegione(cell.getValue());
            }
        }
        return result;
    }
}

View File

@@ -1,9 +1,5 @@
package it.noah.crawler.dom; package it.noah.crawler.dom;
import it.noah.crawler.dom.tag.Table;
import it.noah.crawler.dom.tag.builder.TableBuilder;
import it.noah.crawler.exception.ObjectNotFoundException;
import java.io.IOException; import java.io.IOException;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
@@ -11,6 +7,10 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import it.noah.crawler.dom.tag.Table;
import it.noah.crawler.dom.tag.builder.TableBuilder;
import it.noah.crawler.exception.ObjectNotFoundException;
public class NoahDOMExplorer { public class NoahDOMExplorer {
private Document document; private Document document;
@@ -26,8 +26,7 @@ public class NoahDOMExplorer {
public synchronized Table getTable(String tableName, boolean header) public synchronized Table getTable(String tableName, boolean header)
throws ObjectNotFoundException { throws ObjectNotFoundException {
if (document == null) { if (document == null) {
throw new ObjectNotFoundException( throw new ObjectNotFoundException("Document not found!!!");
"Documento non presente in sessione");
} }
Elements elements = document.getElementsByTag("table"); Elements elements = document.getElementsByTag("table");
Element element = selectByName(elements, tableName); Element element = selectByName(elements, tableName);
@@ -38,7 +37,7 @@ public class NoahDOMExplorer {
private synchronized Element selectByName(Elements elements, String name) { private synchronized Element selectByName(Elements elements, String name) {
Element selected = null; Element selected = null;
if (name == null || elements == null) { if (name == null || elements == null) {
throw new IllegalArgumentException("Parametri non validi!!!"); throw new IllegalArgumentException("Parameter not valid!!!");
} }
for (Element current : elements) { for (Element current : elements) {
if (name.equals(current.className())) { if (name.equals(current.className())) {

View File

@@ -1,5 +1,5 @@
package it.noah.crawler.enums; package it.noah.crawler.enums;
public enum CrawlerEnum { public enum CrawlerEnum {
PROVINCIA_CRAWLER, COMUNE_CRAWLER DISTRICT_CRAWLER, TOWN_CRAWLER
} }

View File

@@ -1,16 +0,0 @@
package it.noah.crawler.enums;
/**
 * URLs used by the crawlers.
 */
public enum UrlEnum {

    /** Page listing the province initials. */
    PROVINCE_URL(
            "http://www.aci.it/i-servizi/normative/codice-della-strada/elenco-sigle-province-ditalia.html");

    /** Address associated with the constant; assigned once at construction. */
    private final String url;

    UrlEnum(String address) {
        url = address;
    }

    /**
     * @return the URL associated with this constant
     */
    public String getUrl() {
        return url;
    }
}

View File

@@ -2,18 +2,18 @@ package it.noah.crawler.factory;
import it.noah.crawler.NoahCrawler; import it.noah.crawler.NoahCrawler;
import it.noah.crawler.enums.CrawlerEnum; import it.noah.crawler.enums.CrawlerEnum;
import it.noah.crawler.impl.ComuneNoahCrawler; import it.noah.crawler.impl.NoahTownCrawler;
import it.noah.crawler.impl.ProvinciaNoahCrawler; import it.noah.crawler.impl.NoahDistrictCrawler;
import java.io.IOException; import java.io.IOException;
public class NoahCrawlerFactory { public class NoahCrawlerFactory {
public static NoahCrawler getInstance(CrawlerEnum crawler) throws IOException { public static NoahCrawler getInstance(CrawlerEnum crawler) throws IOException {
if (crawler == CrawlerEnum.PROVINCIA_CRAWLER) { if (crawler == CrawlerEnum.DISTRICT_CRAWLER) {
return new ProvinciaNoahCrawler(); return new NoahDistrictCrawler();
} else if (crawler == CrawlerEnum.COMUNE_CRAWLER) { } else if (crawler == CrawlerEnum.TOWN_CRAWLER) {
return new ComuneNoahCrawler(); return new NoahTownCrawler();
} }
return null; return null;
} }

View File

@@ -1,40 +0,0 @@
package it.noah.crawler.impl;
import org.geonames.Toponym;
import org.geonames.ToponymSearchCriteria;
import org.geonames.ToponymSearchResult;
import org.geonames.WebService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import it.noah.crawler.NoahCrawler;
import it.noah.crawler.exception.NoahCrawlerException;
/**
 * Unfinished crawler that queries the GeoNames web service for Italian
 * toponyms. It currently only prints the total result count; nothing is
 * converted or stored.
 */
public class ComuneNoahCrawler implements NoahCrawler {

    private static final Logger log = LoggerFactory.getLogger(ComuneNoahCrawler.class);

    // http://api.geonames.org/postalCodeSearch?placename=IT&username=morgoth

    /**
     * Runs a toponym search restricted to country code "IT" with at most
     * 1000 rows. Any exception is caught and its message logged.
     */
    @Override
    public void run() throws NoahCrawlerException {
        try {
            WebService.setUserName("morgoth");

            ToponymSearchCriteria searchCriteria = new ToponymSearchCriteria();
            searchCriteria.setLanguage("IT");
            searchCriteria.setCountryCode("IT");
            searchCriteria.setMaxRows(1000);

            ToponymSearchResult searchResult = WebService.search(searchCriteria);
            System.out.println(searchResult.getTotalResultsCount());

            for (Toponym current : searchResult.getToponyms()) {
                // TODO: finish the implementation
            }
            // TODO: save to the database
        } catch (Exception failure) {
            log.error(failure.getMessage());
        }
    }
}

View File

@@ -0,0 +1,52 @@
package it.noah.crawler.impl;
import java.io.IOException;
import java.sql.SQLException;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import it.noah.crawler.AbstractNoahCrawler;
import it.noah.crawler.NoahCrawler;
import it.noah.crawler.converter.DistrictConverter;
import it.noah.crawler.dom.tag.Table;
import it.noah.crawler.exception.NoahCrawlerException;
import it.noah.crawler.exception.ObjectNotFoundException;
import it.noah.crawler.model.District;
import it.noah.crawler.persistence.DistrictAccess;
/**
 * Crawler that reads the districts table from the page at {@link #URL},
 * converts its rows into {@link District} objects and inserts them through
 * {@link DistrictAccess}.
 */
public class NoahDistrictCrawler extends AbstractNoahCrawler
        implements NoahCrawler {

    private static final Logger log = LoggerFactory
            .getLogger(NoahDistrictCrawler.class);

    /** Page passed to the superclass constructor as the crawl source. */
    private static final String URL = "http://www.aci.it/i-servizi/normative/codice-della-strada/elenco-sigle-province-ditalia.html";

    /**
     * @throws IOException propagated from the superclass constructor
     */
    public NoahDistrictCrawler() throws IOException {
        super(URL);
    }

    /**
     * Extracts the table, converts it and stores the resulting districts.
     *
     * <p>A missing table or a database error is logged, with its stack
     * trace, and not rethrown, so callers are not notified of these failures.
     */
    @Override
    public void run() throws NoahCrawlerException {
        String crawlerName = getClass().getSimpleName();
        try {
            log.info("Start {}", crawlerName);
            List<District> districts = convertTableToDistricts(
                    getTable("", true));
            log.debug("Convertion succesfull!!!");
            new DistrictAccess().insertDistricts(districts);
            log.info("End {}", crawlerName);
        } catch (ObjectNotFoundException e) {
            // Pass the exception itself so the stack trace is not lost.
            log.error("Districts table not found: " + e.getMessage(), e);
        } catch (SQLException e) {
            log.error("Unable to store districts: " + e.getMessage(), e);
        }
    }

    /** Delegates the table conversion to a {@link DistrictConverter}. */
    private List<District> convertTableToDistricts(Table table) {
        return new DistrictConverter().convertTable(table);
    }
}

View File

@@ -0,0 +1,85 @@
package it.noah.crawler.impl;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import it.noah.crawler.NoahCrawler;
import it.noah.crawler.converter.TownConverter;
import it.noah.crawler.exception.NoahCrawlerException;
import it.noah.crawler.model.Town;
/**
 * Crawler that loads towns from a comma-separated file and converts each
 * line into a {@link Town} through {@link TownConverter}.
 */
public class NoahTownCrawler implements NoahCrawler {

    private static final Logger log = LoggerFactory
            .getLogger(NoahTownCrawler.class);

    /** File read when no path is supplied. */
    private static final String DEFAULT_PATH = "D:\\Sviluppo\\workspace\\NoahCrawler\\sql\\towns.csv";

    /** Column separator of the towns file. */
    private static final String CSV_SEPARATOR = ",";

    private String path;

    /** Creates a crawler that reads the default towns file. */
    public NoahTownCrawler() {
        path = DEFAULT_PATH;
    }

    /**
     * Creates a crawler that reads the given towns file.
     *
     * @param path location of the CSV file
     */
    public NoahTownCrawler(String path) {
        this.path = path;
    }

    /**
     * Loads the towns from the configured file. The loaded towns are not
     * stored anywhere yet.
     */
    @Override
    public void run() throws NoahCrawlerException {
        String crawlerName = getClass().getSimpleName();
        log.info("Start {}", crawlerName);
        loadTownsFromCSV(path);
        log.info("End {}", crawlerName);
    }

    /**
     * Reads the file line by line and converts every non-blank line.
     *
     * @param path location of the CSV file
     * @return the towns converted before the end of the file or before the
     *         first I/O error; empty when the file cannot be opened
     */
    private List<Town> loadTownsFromCSV(String path) {
        List<Town> towns = new ArrayList<Town>();
        // try-with-resources closes the reader on every exit path.
        // NOTE(review): FileReader uses the platform default charset -
        // confirm the encoding of the towns file.
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    // A blank line has no columns and cannot be converted.
                    continue;
                }
                towns.add(convertRowToTown(line.split(CSV_SEPARATOR)));
            }
            log.info("Loaded town: {}", towns.size());
        } catch (FileNotFoundException e) {
            log.error("Towns file not found: " + path, e);
        } catch (IOException e) {
            log.error("Error while reading towns file: " + path, e);
        }
        return towns;
    }

    /** Delegates the row conversion to a {@link TownConverter}. */
    private Town convertRowToTown(String[] row) {
        return new TownConverter().convertCSVRowToTown(row);
    }

    /**
     * @return location of the CSV file read by this crawler
     */
    public String getPath() {
        return this.path;
    }

    /**
     * @param path location of the CSV file to read
     */
    public void setPath(String path) {
        this.path = path;
    }
}

View File

@@ -1,49 +0,0 @@
package it.noah.crawler.impl;
import java.io.IOException;
import java.sql.SQLException;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import it.noah.crawler.AbstractNoahCrawler;
import it.noah.crawler.NoahCrawler;
import it.noah.crawler.converter.impl.ProvinciaConverter;
import it.noah.crawler.dom.tag.Table;
import it.noah.crawler.enums.UrlEnum;
import it.noah.crawler.exception.NoahCrawlerException;
import it.noah.crawler.exception.ObjectNotFoundException;
import it.noah.crawler.model.Provincia;
import it.noah.crawler.persistence.AccessProvincia;
public class ProvinciaNoahCrawler extends AbstractNoahCrawler
implements NoahCrawler {
private final static Logger log = LoggerFactory
.getLogger(ProvinciaNoahCrawler.class);
public ProvinciaNoahCrawler() throws IOException {
super(UrlEnum.PROVINCE_URL.getUrl());
}
@Override
public void run() throws NoahCrawlerException {
try {
List<Provincia> province = convertTable2Provincia(
getTable("", true));
log.debug("Provincia convertita con successo!!!");
AccessProvincia access = new AccessProvincia();
access.insertProvince(province);
} catch (ObjectNotFoundException e) {
log.error(e.getMessage());
} catch (SQLException e) {
log.error(e.getMessage());
}
}
private List<Provincia> convertTable2Provincia(Table table) {
return new ProvinciaConverter().convertTable(table);
}
}

View File

@@ -1,55 +0,0 @@
package it.noah.crawler.model;
import java.io.Serializable;
/**
 * Serializable bean describing a comune: identifier, name, postal code
 * (cap), owning {@link Provincia} and region name.
 */
public class Comune implements Serializable {

    private static final long serialVersionUID = -2171167117875954706L;

    private Long id;
    private String nome;
    private String cap;
    private Provincia provincia;
    private String regione;

    /** @return the identifier */
    public Long getId() {
        return this.id;
    }

    /** @param id the identifier */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the name */
    public String getNome() {
        return this.nome;
    }

    /** @param nome the name */
    public void setNome(String nome) {
        this.nome = nome;
    }

    /** @return the postal code */
    public String getCap() {
        return this.cap;
    }

    /** @param cap the postal code */
    public void setCap(String cap) {
        this.cap = cap;
    }

    /** @return the provincia this comune refers to */
    public Provincia getProvincia() {
        return this.provincia;
    }

    /** @param provincia the provincia this comune refers to */
    public void setProvincia(Provincia provincia) {
        this.provincia = provincia;
    }

    /** @return the region name */
    public String getRegione() {
        return this.regione;
    }

    /** @param regione the region name */
    public void setRegione(String regione) {
        this.regione = regione;
    }
}

View File

@@ -0,0 +1,56 @@
package it.noah.crawler.model;
import java.io.Serializable;
import java.util.List;
/**
 * Serializable bean describing a district: identifier, name, initial,
 * region name and the list of its {@link Town}s.
 */
public class District implements Serializable {

    private static final long serialVersionUID = 106451135158559443L;

    private Long id;
    private String name;
    private String initial;
    private String region;
    private List<Town> towns;

    /** @return the identifier */
    public Long getId() {
        return this.id;
    }

    /** @param id the identifier */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the name */
    public String getName() {
        return this.name;
    }

    /** @param name the name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the initial */
    public String getInitial() {
        return this.initial;
    }

    /** @param initial the initial */
    public void setInitial(String initial) {
        this.initial = initial;
    }

    /** @return the region name */
    public String getRegion() {
        return this.region;
    }

    /** @param region the region name */
    public void setRegion(String region) {
        this.region = region;
    }

    /** @return the towns of this district; the stored list itself, not a copy */
    public List<Town> getTowns() {
        return this.towns;
    }

    /** @param towns the towns of this district; stored as given, not copied */
    public void setTowns(List<Town> towns) {
        this.towns = towns;
    }
}

View File

@@ -1,56 +0,0 @@
package it.noah.crawler.model;
import java.io.Serializable;
import java.util.List;
/**
 * Serializable bean describing a provincia: identifier, name, sigla, region
 * name and the list of its {@link Comune} objects.
 */
public class Provincia implements Serializable {

    private static final long serialVersionUID = 106451135158559443L;

    private Long id;
    private String nome;
    private String sigla;
    private String regione;
    private List<Comune> comuni;

    /** @return the identifier */
    public Long getId() {
        return this.id;
    }

    /** @param id the identifier */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the name */
    public String getNome() {
        return this.nome;
    }

    /** @param nome the name */
    public void setNome(String nome) {
        this.nome = nome;
    }

    /** @return the sigla */
    public String getSigla() {
        return this.sigla;
    }

    /** @param sigla the sigla */
    public void setSigla(String sigla) {
        this.sigla = sigla;
    }

    /** @return the region name */
    public String getRegione() {
        return this.regione;
    }

    /** @param regione the region name */
    public void setRegione(String regione) {
        this.regione = regione;
    }

    /** @return the comuni of this provincia; the stored list itself, not a copy */
    public List<Comune> getComuni() {
        return this.comuni;
    }

    /** @param comuni the comuni of this provincia; stored as given, not copied */
    public void setComuni(List<Comune> comuni) {
        this.comuni = comuni;
    }
}

View File

@@ -0,0 +1,55 @@
package it.noah.crawler.model;
import java.io.Serializable;
/**
 * Serializable bean describing a town: identifier, name, postal code,
 * owning {@link District} and region name.
 */
public class Town implements Serializable {

    private static final long serialVersionUID = -2171167117875954706L;

    private Long id;
    private String name;
    private String postalCode;
    private District district;
    private String region;

    /** @return the identifier */
    public Long getId() {
        return this.id;
    }

    /** @param id the identifier */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the name */
    public String getName() {
        return this.name;
    }

    /** @param name the name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the postal code */
    public String getPostalCode() {
        return this.postalCode;
    }

    /** @param postalCode the postal code */
    public void setPostalCode(String postalCode) {
        this.postalCode = postalCode;
    }

    /** @return the district this town refers to */
    public District getDistrict() {
        return this.district;
    }

    /** @param district the district this town refers to */
    public void setDistrict(District district) {
        this.district = district;
    }

    /** @return the region name */
    public String getRegion() {
        return this.region;
    }

    /** @param region the region name */
    public void setRegion(String region) {
        this.region = region;
    }
}

View File

@@ -6,33 +6,33 @@ import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.List; import java.util.List;
import it.noah.crawler.model.Provincia; import it.noah.crawler.model.District;
public class AccessProvincia { public class DistrictAccess {
public void insertProvince(List<Provincia> province) throws SQLException { public void insertDistricts(List<District> districts) throws SQLException {
Connection conn = null; Connection conn = null;
PreparedStatement regioni = null; PreparedStatement regions = null;
PreparedStatement stmt = null; PreparedStatement stmt = null;
ResultSet regione = null; ResultSet region = null;
try { try {
conn = ConnectionFactory.getInstance(); conn = ConnectionFactory.getInstance();
conn.setAutoCommit(false); conn.setAutoCommit(false);
for (Provincia provincia : province) { for (District district : districts) {
regioni = conn.prepareStatement( regions = conn.prepareStatement(
"select id from regioni where nome = ?"); "select id from regioni where nome = ?");
regioni.setString(1, provincia.getRegione()); regions.setString(1, district.getRegion());
regione = regioni.executeQuery(); region = regions.executeQuery();
regione.next(); region.next();
stmt = conn.prepareStatement( stmt = conn.prepareStatement(
"insert into province (idRegione, nome, sigla) values (?, ?, ?)"); "insert into province (idRegione, nome, sigla) values (?, ?, ?)");
stmt.setInt(1, regione.getInt("id")); stmt.setInt(1, region.getInt("id"));
stmt.setString(2, provincia.getNome()); stmt.setString(2, district.getName());
stmt.setString(3, provincia.getSigla()); stmt.setString(3, district.getInitial());
stmt.executeUpdate(); stmt.executeUpdate();
if (stmt != null && regione != null) { if (stmt != null && region != null) {
regione.close(); region.close();
stmt.close(); stmt.close();
} }
} }

View File

@@ -0,0 +1,41 @@
package it.noah.crawler.services;
import java.util.List;
import org.geonames.PostalCode;
import org.geonames.PostalCodeSearchCriteria;
import org.geonames.WebService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Looks up postal codes by place name through the GeoNames
 * {@link WebService}.
 */
public class NoahPostalCodeSeeker {

    private static final Logger log = LoggerFactory
            .getLogger(NoahPostalCodeSeeker.class);

    /**
     * Searches the postal code of a town.
     *
     * <p>A postal code is returned only when the search yields exactly one
     * match. With no match, with several matches, or when the lookup fails,
     * the result is {@code null}. Failures are logged and not rethrown.
     *
     * @param username    user name set on the web service before the search
     * @param town        place name to search for
     * @param countryCode country code restricting the search
     * @return the postal code of the single match, or {@code null}
     */
    public static String getPostalCodeFromName(String username, String town,
            String countryCode) {
        try {
            WebService.setUserName(username);

            PostalCodeSearchCriteria criteria = new PostalCodeSearchCriteria();
            criteria.setPlaceName(town);
            criteria.setCountryCode(countryCode);

            List<PostalCode> result = WebService.postalCodeSearch(criteria);
            int matches = (result == null) ? 0 : result.size();
            log.debug("Finder results for '{}': {}", town, matches);

            if (matches == 1) {
                return result.get(0).getPostalCode();
            }
            if (matches > 1) {
                // No rule exists yet for choosing among several candidates.
                log.warn("Ambiguous postal code for '{}': {} matches, none selected",
                        town, matches);
            }
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is not lost.
            log.error("Postal code lookup failed for '" + town + "'", e);
        }
        return null;
    }
}

View File

@@ -60,24 +60,27 @@ public class NoahCrawlerServlet extends HttpServlet {
try { try {
if (param == null) { if (param == null) {
log.warn("ATTENZIONE!!! Stanno partendo tutti i crawler"); log.warn("WARNING!!! All crawler started!");
runAllCrawler(); runAllCrawler();
} }
if (CrawlerEnum.PROVINCIA_CRAWLER.name().equalsIgnoreCase(param)) { if (CrawlerEnum.DISTRICT_CRAWLER.name().equalsIgnoreCase(param)) {
runCrawler(CrawlerEnum.PROVINCIA_CRAWLER); runCrawler(CrawlerEnum.DISTRICT_CRAWLER);
} else if (CrawlerEnum.COMUNE_CRAWLER.name() out.println("<h1>Successful Job!!!</h1>");
} else if (CrawlerEnum.TOWN_CRAWLER.name()
.equalsIgnoreCase(param)) { .equalsIgnoreCase(param)) {
runCrawler(CrawlerEnum.COMUNE_CRAWLER); runCrawler(CrawlerEnum.TOWN_CRAWLER);
out.println("<h1>Successful Job!!!</h1>");
} else if ("allCrawler".equalsIgnoreCase(param)) { } else if ("allCrawler".equalsIgnoreCase(param)) {
runAllCrawler(); runAllCrawler();
out.println("<h1>Successful Job!!!</h1>");
} else {
out.println("<h1>Error 500 parameter not valid</h1>");
} }
} catch (NoahCrawlerException | IOException e) { } catch (NoahCrawlerException | IOException e) {
log.error("[ERRORE] " + e.getMessage()); log.error("[ERROR] " + e.getMessage());
out.println("[ERRORE] " + e.getMessage()); out.println("[ERROR] " + e.getMessage());
} }
out.println("<h1>Job eseguito con successo!!!</h1>");
} }
private void runAllCrawler() throws NoahCrawlerException, IOException { private void runAllCrawler() throws NoahCrawlerException, IOException {