Skip to content

Instantly share code, notes, and snippets.

@shrink0r
Last active August 29, 2015 14:01
Show Gist options
  • Select an option

  • Save shrink0r/82af462b281b4b547f0b to your computer and use it in GitHub Desktop.

Select an option

Save shrink0r/82af462b281b4b547f0b to your computer and use it in GitHub Desktop.

Revisions

  1. shrink0r renamed this gist May 28, 2014. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. shrink0r renamed this gist May 28, 2014. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. shrink0r revised this gist May 28, 2014. 1 changed file with 14 additions and 0 deletions.
    14 changes: 14 additions & 0 deletions create.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,14 @@
    create index
    ```
    curl -XPUT localhost:9200/localnews/ -d @localnews.index.json
    ```

    create mapping
    ```
    curl -XPUT localhost:9200/localnews/news_item/_mapping -d @localnews.mapping.json
    ```

    bulk index documents
    ```
    curl -XPOST localhost:9200/_bulk --data-binary @localnews.bulk
    ```
  4. shrink0r revised this gist May 28, 2014. 2 changed files with 227 additions and 0 deletions.
    54 changes: 54 additions & 0 deletions index.json
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,54 @@
    {
    "settings": {
    "number_of_shards": 2,
    "number_of_replicas": 1,
    "analysis": {
    "analyzer": {
    "DefaultAnalyzer": {
    "type": "custom",
    "tokenizer": "whitespace",
    "filter": [
    "lowercase"
    ]
    },
    "AutoCompleteAnalyzer": {
    "type": "custom",
    "tokenizer": "whitespace",
    "filter": [
    "lowercase",
    "edge"
    ]
    },
    "IcuAnalyzer_DE": {
    "type": "custom",
    "tokenizer": "keyword",
    "filter": [
    "collation_de"
    ]
    }
    },
    "filter": {
    "snowball": {
    "type": "snowball",
    "language": "German2"
    },
    "edge": {
    "type": "edgeNGram",
    "min_gram": 1,
    "max_gram": 10,
    "side": "front"
    },
    "collation_de": {
    "type": "icu_collation",
    "language": "de",
    "country": "DE",
    "alternate": "shifted",
    "caseLevel": true,
    "caseFirst": "upper",
    "numeric": true,
    "hiraganaQuaternaryMode": false
    }
    }
    }
    }
    }
    173 changes: 173 additions & 0 deletions mapping.json
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,173 @@
    {
    "news_item" : {
    "dynamic": false,
    "index_analyzer": "DefaultAnalyzer",
    "search_analyzer": "DefaultAnalyzer",
    "properties" : {
    "title": {
    "type" : "multi_field",
    "fields": {
    "title": {
    "type": "string"
    },
    "suggest": {
    "type": "string",
    "analyzer": "AutoCompleteAnalyzer"
    },
    "raw": {
    "type": "string",
    "analyzer": "IcuAnalyzer_DE"
    }
    }
    },
    "text": {"type": "string"},
    "teaser": {"type": "string"},
    "publish_date": {
    "type": "date",
    "format": "date_optional_time"
    },
    "category": {
    "type" : "multi_field",
    "fields": {
    "category": {
    "type": "string"
    },
    "suggest": {
    "type": "string",
    "analyzer": "AutoCompleteAnalyzer"
    },
    "raw": {
    "type": "string",
    "analyzer": "IcuAnalyzer_DE"
    }
    }
    },
    "tags": {
    "type" : "multi_field",
    "fields": {
    "tags": {
    "type": "string"
    },
    "suggest": {
    "type": "string",
    "analyzer": "AutoCompleteAnalyzer"
    },
    "raw": {
    "type": "string",
    "analyzer": "IcuAnalyzer_DE"
    }
    }
    },
    "location": {
    "type": "object",
    "properties": {
    "coordinates": {"type": "geo_point"},
    "zipcode": {
    "type" : "multi_field",
    "fields": {
    "zipcode": {
    "type": "string"
    },
    "raw": {
    "type": "string",
    "analyzer": "IcuAnalyzer_DE"
    }
    }
    },
    "street": {
    "type" : "multi_field",
    "fields": {
    "street": {
    "type": "string"
    },
    "suggest": {
    "type": "string",
    "analyzer": "AutoCompleteAnalyzer"
    },
    "raw": {
    "type": "string",
    "analyzer": "IcuAnalyzer_DE"
    }
    }
    },
    "administrative_district": {
    "type" : "multi_field",
    "fields": {
    "administrative_district": {
    "type": "string"
    },
    "suggest": {
    "type": "string",
    "analyzer": "AutoCompleteAnalyzer"
    },
    "raw": {
    "type": "string",
    "analyzer": "IcuAnalyzer_DE"
    }
    }
    },
    "district": {
    "type" : "multi_field",
    "fields": {
    "district": {
    "type": "string"
    },
    "suggest": {
    "type": "string",
    "analyzer": "AutoCompleteAnalyzer"
    },
    "raw": {
    "type": "string",
    "analyzer": "IcuAnalyzer_DE"
    }
    }
    },
    "neighborhood": {
    "type" : "multi_field",
    "fields": {
    "neighborhood": {
    "type": "string"
    },
    "suggest": {
    "type": "string",
    "analyzer": "AutoCompleteAnalyzer"
    },
    "raw": {
    "type": "string",
    "analyzer": "IcuAnalyzer_DE"
    }
    }
    },
    "city": {
    "type" : "multi_field",
    "fields": {
    "city": {
    "type": "string"
    },
    "raw": {
    "type": "string",
    "analyzer": "IcuAnalyzer_DE"
    }
    }
    },
    "name": {
    "type" : "multi_field",
    "fields": {
    "name": {
    "type": "string"
    },
    "suggest": {
    "type": "string",
    "analyzer": "AutoCompleteAnalyzer"
    },
    "raw": {
    "type": "string",
    "analyzer": "IcuAnalyzer_DE"
    }
    }
    }
    }
    }
    }
    }
    }
  5. shrink0r revised this gist May 28, 2014. No changes.
  6. shrink0r revised this gist May 28, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion localnews_dataset.md
    Original file line number Diff line number Diff line change
    @@ -16,7 +16,7 @@ One news item has the following fields:
    - **title:text** A news item's title.
    - **text:text** The news item's text/content.
    - **teaser:text** Shorter than text, can be used for teasers...
    - **publishDate:date** Point of time at which an item was published (ISO8601 date format)
    - **publish_date:date** Point of time at which an item was published (ISO8601 date format)
    - **source:text** Holds a news item's source, reporting entity.
    - **category:text** A news item's category; one of: *Polizeimeldungen*, *Kiezleben*, *Kiezkultur*, *Bekanntmachungen*, *Stadtteilentwicklung*
    - **tags:text** A list of arbitrary tags.
  7. shrink0r revised this gist May 28, 2014. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions localnews_dataset.md
    Original file line number Diff line number Diff line change
    @@ -40,20 +40,20 @@ Below an example news item in the JSON format:
    "lat": 52.48127,
    "lon": 13.43558
    },
    "administrativeDistrict": "Neukölln",
    "administrative_district": "Neukölln",
    "district": "Neukölln",
    "neighborhood": "Neukölln",
    "street": "Karl-Marx-Straße 83",
    "name": "Rathaus Neukölln",
    "city": "Berlin",
    "postCode": "12043"
    "zipcode": "12043"
    },
    "title": "Aktuelle Bebauungspläne liegen im Rathaus aus",
    "tags": [],
    "category": "bekanntmachungen",
    "teaser": "Die Bebauungspläne für die Grundstücke Hermannstraße 134 bis 137A sowie eine Teilfläche des Grundstücks Hermannstraße 133 liegen im Fachbereich Stadtplanung aus. Bewohner können sich hier bis 25. Oktober 2012 über den Stand der Dinge informieren.",
    "text": "Wesentliches Ziel ist die planungsrechtliche Sicherung der vorhandenen Wohnbebauung sowie der Schaffung neuer Wohnbauflächen im Ortsteil Neukölln durch die Festsetzung von Allgemeinen Wohngebieten (WA). Die Erschließung wird mittels privater Verkehrsflächen und Straßenverkehrsflächen gesichert.\nDas Verfahren wird gemäß § 13a des Baugesetzbuchs als beschleunigtes Verfahren ohne Durchführung einer Umweltprüfung nach § 2 Abs. 4 des Baugesetzbuchs durchgeführt.\n\nDie Bürger sind innerhalb der Auslegungszeit aufgefordert, Anregungen vorzubringen. Dabei besteht auch die Möglichkeit, sich unmittelbar online zu beteiligen.\n\nDer Plan kann von Montag bis Donnerstag von 8.30 Uhr bis 16.30 Uhr und freitags von 8.30 Uhr bis 15.30 Uhr im Rathaus Neukölln eingesehen werden.\nDer Bebauungsplanentwurf kann auch im Internet eingesehen werden unter: http://www.berlin.de/ba-neukoelln/verwaltung/bebauungsplaene/bplan.html",
    "source": "Bezirksamt Neukölln",
    "publishDate": "2012-10-09T12:32:31+0200"
    "publish_date": "2012-10-09T12:32:31+0200"
    }
    ```
  8. shrink0r renamed this gist May 28, 2014. 1 changed file with 8 additions and 7 deletions.
    15 changes: 8 additions & 7 deletions Localnews Dataset → localnews_dataset.md
    Original file line number Diff line number Diff line change
    @@ -1,15 +1,16 @@
    LocalNews Dataset
    =========
    # Local News of Berlin (Dataset)

    https://s3-us-west-2.amazonaws.com/elasticsearch-hackfest/localnews.bulk
    Download the raw data here: https://s3-us-west-2.amazonaws.com/elasticsearch-hackfest/localnews.bulk

    This LocalNews dataset contains 21453 records of localnews for Berlin.
    Each item is related to a district and has a (lon/lat) geo point.
    This LocalNews dataset contains 32606 records of local news for Berlin.
    Each item is related to a district and has a (lon/lat) geo coordinate.
    For information on Berlin's district/neighborhood structure see:

    http://berlin.barwick.de/information/districts/index.html

    Furthermore, every item has a category and optional tags.

    Following up a list of the fields that make up a news item:
    One news item has the following fields:

    - **id:text** An id which is unique inside this dataset.
    - **title:text** A news item's title.
    @@ -55,4 +56,4 @@ Below an example news item in the JSON format:
    "source": "Bezirksamt Neukölln",
    "publishDate": "2012-10-09T12:32:31+0200"
    }
    ```
    ```
  9. shrink0r revised this gist May 28, 2014. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions Localnews Dataset
    Original file line number Diff line number Diff line change
    @@ -22,8 +22,8 @@ Following up a list of the fields that make up a news item:
    - **location:object** An object describing a news item's location.
    - **coordinates:geopoint** Holds the WGS84 longitude & latitude values that localize a news item.
    - **street:text**
    - **postCode:text**
    - **administrativeDistrict:text** The official/administrative name of a news item's district.
    - **zipcode:text**
    - **administrative_district:text** The official/administrative name of a news item's district.
    - **district:text** Holds an item's "old" district name (for example Prenzlauer Berg was its own district not so long ago)
    - **neighborhood:text** Holds the neighborhood in Berlin that a news item relates to. (Grunewald, Niederschönhausen etc.)
    - **city:text**
  10. shrink0r revised this gist May 28, 2014. 1 changed file with 55 additions and 1 deletion.
    56 changes: 55 additions & 1 deletion Localnews Dataset
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,58 @@
    LocalNews Dataset
    =========

    https://s3-us-west-2.amazonaws.com/elasticsearch-hackfest/localnews.bulk

    This LocalNews dataset contains 21453 records of localnews for Berlin.
    Each item is related to a district and has a (lon/lat) geo point.
    For information on Berlin's district/neighborhood structure see:
    http://berlin.barwick.de/information/districts/index.html
    Furthermore, every item has a category and optional tags.

    https://s3-us-west-2.amazonaws.com/elasticsearch-hackfest/localnews.bulk
    The following is a list of the fields that make up a news item:

    - **id:text** An id which is unique inside this dataset.
    - **title:text** A news item's title.
    - **text:text** The news item's text/content.
    - **teaser:text** Shorter than text, can be used for teasers...
    - **publishDate:date** Point of time at which an item was published (ISO8601 date format)
    - **source:text** Holds a news item's source, reporting entity.
    - **category:text** A news item's category; one of: *Polizeimeldungen*, *Kiezleben*, *Kiezkultur*, *Bekanntmachungen*, *Stadtteilentwicklung*
    - **tags:text** A list of arbitrary tags.
    - **location:object** An object describing a news item's location.
    - **coordinates:geopoint** Holds the WGS84 longitude & latitude values that localize a news item.
    - **street:text**
    - **postCode:text**
    - **administrativeDistrict:text** The official/administrative name of a news item's district.
    - **district:text** Holds an item's "old" district name (for example Prenzlauer Berg was its own district not so long ago)
    - **neighborhood:text** Holds the neighborhood in Berlin that a news item relates to. (Grunewald, Niederschönhausen etc.)
    - **city:text**
    - **name:text**

    Below is an example news item in JSON format:

    ```json
    {
    "id": "localnews-10001",
    "location": {
    "coordinates": {
    "lat": 52.48127,
    "lon": 13.43558
    },
    "administrativeDistrict": "Neukölln",
    "district": "Neukölln",
    "neighborhood": "Neukölln",
    "street": "Karl-Marx-Straße 83",
    "name": "Rathaus Neukölln",
    "city": "Berlin",
    "postCode": "12043"
    },
    "title": "Aktuelle Bebauungspläne liegen im Rathaus aus",
    "tags": [],
    "category": "bekanntmachungen",
    "teaser": "Die Bebauungspläne für die Grundstücke Hermannstraße 134 bis 137A sowie eine Teilfläche des Grundstücks Hermannstraße 133 liegen im Fachbereich Stadtplanung aus. Bewohner können sich hier bis 25. Oktober 2012 über den Stand der Dinge informieren.",
    "text": "Wesentliches Ziel ist die planungsrechtliche Sicherung der vorhandenen Wohnbebauung sowie der Schaffung neuer Wohnbauflächen im Ortsteil Neukölln durch die Festsetzung von Allgemeinen Wohngebieten (WA). Die Erschließung wird mittels privater Verkehrsflächen und Straßenverkehrsflächen gesichert.\nDas Verfahren wird gemäß § 13a des Baugesetzbuchs als beschleunigtes Verfahren ohne Durchführung einer Umweltprüfung nach § 2 Abs. 4 des Baugesetzbuchs durchgeführt.\n\nDie Bürger sind innerhalb der Auslegungszeit aufgefordert, Anregungen vorzubringen. Dabei besteht auch die Möglichkeit, sich unmittelbar online zu beteiligen.\n\nDer Plan kann von Montag bis Donnerstag von 8.30 Uhr bis 16.30 Uhr und freitags von 8.30 Uhr bis 15.30 Uhr im Rathaus Neukölln eingesehen werden.\nDer Bebauungsplanentwurf kann auch im Internet eingesehen werden unter: http://www.berlin.de/ba-neukoelln/verwaltung/bebauungsplaene/bplan.html",
    "source": "Bezirksamt Neukölln",
    "publishDate": "2012-10-09T12:32:31+0200"
    }
    ```
  11. shrink0r created this gist May 28, 2014.
    4 changes: 4 additions & 0 deletions Localnews Dataset
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,4 @@



    https://s3-us-west-2.amazonaws.com/elasticsearch-hackfest/localnews.bulk