Skip to content

Commit 7f91ab8

Browse files
committed
add: Use Case: Clustering with KMeans
1 parent 9e86909 commit 7f91ab8

2 files changed

Lines changed: 7683 additions & 0 deletions

File tree

Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
package examples;
2+
3+
import jnr.ffi.Memory;
import jnr.ffi.Pointer;
import jnr.ffi.Runtime;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

import static functions.functions.*;
11+
12+
/**
13+
* Demonstrates K-means spatial clustering on populated places.
14+
*
15+
* K-means clustering groups geographic points into K clusters based on proximity.
16+
* Each cluster contains points that are close to each other spatially.
17+
*
18+
* This program:
19+
* 1. Reads populated places from CSV (name, population, location)
20+
* 2. Applies K-means clustering to group places into 10 clusters
21+
* 3. Writes results with cluster assignments to output CSV
22+
*
23+
* SQL Equivalent in PostGIS:
24+
* <pre>
25+
* SELECT name, pop_max, geom,
26+
* ST_ClusterKMeans(geom, 10) OVER () AS cluster
27+
* FROM popplaces;
28+
* </pre>
29+
* https://postgis.net/docs/ST_ClusterKMeans.html
30+
*
31+
* Use Cases:
32+
* - Grouping cities by geographic region
33+
* - Optimizing location-based services
34+
*
35+
* Data Source: Natural Earth 1:10M Populated Places
36+
* https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-populated-places/
37+
*/
38+
public class N16_Clustering_KMeans {
39+
40+
// Configuration
41+
private static final int MAX_ROWS = 50000;
42+
private static final int BATCH_SIZE = 1000;
43+
private static final int NUM_CLUSTERS = 10;
44+
private static final int NUM_SAMPLES = 5;
45+
private static final String DATA_DIR = "src/main/java/examples/data/";
46+
private static final String INPUT_FILE = DATA_DIR + "popplaces.csv";
47+
private static final String OUTPUT_FILE = DATA_DIR + "popplaces_clustered.csv";
48+
49+
/**
50+
* Record representing a populated place
51+
*/
52+
static class PopulatedPlace {
53+
String name;
54+
int population;
55+
Pointer geom;
56+
int cluster;
57+
58+
PopulatedPlace(String name, int population, Pointer geom) {
59+
this.name = name;
60+
this.population = population;
61+
this.geom = geom;
62+
this.cluster = -1;
63+
}
64+
}
65+
66+
/**
67+
* Read populated places from CSV file
68+
*/
69+
private static List<PopulatedPlace> readPlaces() throws IOException {
70+
List<PopulatedPlace> places = new ArrayList<>();
71+
int nullCount = 0;
72+
73+
System.out.println("\n" + "=".repeat(60));
74+
System.out.println("Reading populated places from CSV");
75+
System.out.println("=".repeat(60));
76+
System.out.println("Reading (one '*' every " + BATCH_SIZE + " records):");
77+
System.out.print(" ");
78+
79+
try (BufferedReader br = new BufferedReader(new FileReader(INPUT_FILE))) {
80+
// Skip header
81+
String header = br.readLine();
82+
83+
String line;
84+
int lineNumber = 0;
85+
86+
while ((line = br.readLine()) != null && places.size() < MAX_ROWS) {
87+
lineNumber++;
88+
89+
// Parse CSV: name,pop_max,geom
90+
String[] parts = line.split(",", 3);
91+
92+
if (parts.length != 3) {
93+
System.out.println("Record nb. " + lineNumber + " with missing values ignored");
94+
nullCount++;
95+
continue;
96+
}
97+
98+
try {
99+
String name = parts[0].trim();
100+
int population = Integer.parseInt(parts[1].trim());
101+
String geomWKT = parts[2].trim();
102+
103+
// Parse geometry
104+
Pointer geom = geom_in(geomWKT, -1);
105+
106+
if (geom != null) {
107+
places.add(new PopulatedPlace(name, population, geom));
108+
109+
// Print progress marker
110+
if (places.size() % BATCH_SIZE == 0) {
111+
System.out.print("*");
112+
System.out.flush();
113+
}
114+
} else {
115+
nullCount++;
116+
}
117+
} catch (NumberFormatException e) {
118+
nullCount++;
119+
}
120+
}
121+
}
122+
123+
System.out.println("\n\n┌─ READING COMPLETE ───────────────────────");
124+
System.out.println("│ Records read: " + places.size());
125+
System.out.println("│ Incomplete records ignored: " + nullCount);
126+
System.out.println("└──────────────────────────────────────────");
127+
128+
return places;
129+
}
130+
131+
/**
132+
* Apply K-means clustering to the places
133+
*/
134+
private static void applyKMeansClustering(List<PopulatedPlace> places) {
135+
System.out.println("\n" + "=".repeat(60));
136+
System.out.println("Applying K-means Clustering");
137+
System.out.println("=".repeat(60));
138+
System.out.println("Algorithm: K-means");
139+
System.out.println("Number of clusters: " + NUM_CLUSTERS);
140+
System.out.println("Number of points: " + places.size());
141+
142+
long startTime = System.currentTimeMillis();
143+
144+
// Create array of geometry pointers
145+
int count = places.size();
146+
Runtime runtime = Runtime.getSystemRuntime();
147+
Pointer geometriesPtr = Memory.allocate(runtime, count * 8); // 8 bytes per pointer
148+
149+
// Fill array with geometry pointers
150+
for (int i = 0; i < count; i++) {
151+
geometriesPtr.putPointer(i * 8, places.get(i).geom);
152+
}
153+
154+
// Apply K-means clustering
155+
Pointer clustersPtr = geo_cluster_kmeans(geometriesPtr, count, NUM_CLUSTERS);
156+
157+
long elapsed = System.currentTimeMillis() - startTime;
158+
159+
// Read cluster assignments
160+
int[] clusters = new int[count];
161+
clustersPtr.get(0, clusters, 0, count);
162+
163+
// Assign clusters to places
164+
for (int i = 0; i < count; i++) {
165+
places.get(i).cluster = clusters[i];
166+
}
167+
168+
System.out.println("\n┌─ CLUSTERING COMPLETE ────────────────────");
169+
System.out.println("│ Time taken: " + elapsed + " ms");
170+
System.out.println("│ Clusters created: " + NUM_CLUSTERS);
171+
System.out.println("└──────────────────────────────────────────");
172+
}
173+
174+
175+
/**
176+
* Display sample places from each cluster
177+
*/
178+
private static void displaySamplePlaces(List<PopulatedPlace> places) {
179+
System.out.println("\n┌─ SAMPLE PLACES PER CLUSTER ──────────────");
180+
181+
for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
182+
System.out.println("│");
183+
System.out.println("│ Cluster " + cluster + ":");
184+
185+
final int currentCluster = cluster;
186+
List<PopulatedPlace> clusterSamples = places.stream()
187+
.filter(p -> p.cluster == currentCluster)
188+
.limit(NUM_SAMPLES) // 5 samples
189+
.toList();
190+
191+
for (PopulatedPlace place : clusterSamples) {
192+
String geomStr = geo_out(place.geom);
193+
System.out.printf("│ - %s (pop: %,d) %s\n",
194+
place.name, place.population, geomStr);
195+
}
196+
197+
if (clusterSamples.isEmpty()) {
198+
System.out.println("│ (No places assigned to this cluster)");
199+
}
200+
}
201+
202+
System.out.println("└──────────────────────────────────────────");
203+
}
204+
205+
/**
206+
* Write clustered results to CSV file
207+
*/
208+
private static void writeResults(List<PopulatedPlace> places) throws IOException {
209+
System.out.println("\n" + "=".repeat(60));
210+
System.out.println("Writing Results to CSV");
211+
System.out.println("=".repeat(60));
212+
213+
try (BufferedWriter writer = new BufferedWriter(new FileWriter(OUTPUT_FILE))) {
214+
// Write header
215+
writer.write("name,pop_max,geom,cluster\n");
216+
217+
// Write data
218+
for (PopulatedPlace place : places) {
219+
String geomStr = geo_out(place.geom);
220+
writer.write(String.format("%s,%d,%s,%d\n",
221+
place.name, place.population, geomStr, place.cluster));
222+
}
223+
}
224+
225+
System.out.println("\n┌─ OUTPUT COMPLETE ────────────────────────");
226+
System.out.println("│ File: " + OUTPUT_FILE);
227+
System.out.println("│ Records written: " + places.size());
228+
System.out.println("└──────────────────────────────────────────");
229+
}
230+
231+
/**
232+
* Explain K-means clustering concept
233+
*/
234+
private static void explainKMeans() {
235+
System.out.println("\n╔═══════════════════════════════════════════════════════════╗");
236+
System.out.println("║ K-MEANS CLUSTERING EXPLAINED ║");
237+
System.out.println("╚═══════════════════════════════════════════════════════════╝");
238+
System.out.println();
239+
System.out.println("What is K-means Clustering?");
240+
System.out.println("──────────────────────────");
241+
System.out.println("K-means groups geographic points into K clusters where each");
242+
System.out.println("point belongs to the cluster with the nearest center (centroid).");
243+
System.out.println();
244+
System.out.println("How it works:");
245+
System.out.println("─────────────");
246+
System.out.println("1. Choose K initial cluster centers (randomly or smart placement)");
247+
System.out.println("2. Assign each point to the nearest cluster center");
248+
System.out.println("3. Recalculate cluster centers (average position of all points)");
249+
System.out.println("4. Repeat steps 2-3 until centers don't move significantly");
250+
System.out.println();
251+
System.out.println("Example with cities:");
252+
System.out.println("───────────────────");
253+
System.out.println();
254+
System.out.println(" Before clustering: After K-means (K=3):");
255+
System.out.println(" ┌──────────────┐ ┌──────────────┐");
256+
System.out.println(" │ • • • • • │ │ A A A B B │");
257+
System.out.println(" │ • • • │ │ A A B │");
258+
System.out.println(" │ • • • │ │ A A B │");
259+
System.out.println(" │ • • • • │ │ C C C C │");
260+
System.out.println(" │ • • • • │ │ C C C C │");
261+
System.out.println(" └──────────────┘ └──────────────┘");
262+
System.out.println();
263+
System.out.println(" Random points 3 clear clusters!");
264+
System.out.println();
265+
System.out.println("Use Cases:");
266+
System.out.println("──────────");
267+
System.out.println("✓ Delivery zones - Group customers by proximity");
268+
System.out.println("✓ Sales territories - Divide regions for sales teams");
269+
System.out.println("✓ Service areas - Emergency services coverage zones");
270+
System.out.println("✓ Urban planning - Identify city districts");
271+
System.out.println("✓ Retail analysis - Store location optimization");
272+
System.out.println();
273+
System.out.println("Advantages:");
274+
System.out.println("───────────");
275+
System.out.println("• Fast and efficient");
276+
System.out.println("• Simple to understand");
277+
System.out.println("• Works well with large datasets");
278+
System.out.println("• Produces compact, spherical clusters");
279+
System.out.println();
280+
System.out.println("Limitations:");
281+
System.out.println("────────────");
282+
System.out.println("• Must specify K (number of clusters) in advance");
283+
System.out.println("• Assumes roughly equal-sized, spherical clusters");
284+
System.out.println("• Sensitive to initial center placement");
285+
System.out.println("• May not work well with irregularly shaped regions");
286+
System.out.println();
287+
}
288+
289+
public static void main(String[] args) {
290+
// Initialize MEOS
291+
meos_initialize();
292+
meos_initialize_timezone("UTC");
293+
294+
try {
295+
System.out.println("\n" + "=".repeat(60));
296+
System.out.println("K-MEANS SPATIAL CLUSTERING DEMONSTRATION");
297+
System.out.println("Populated Places Dataset");
298+
System.out.println("=".repeat(60));
299+
300+
// Explain the concept
301+
explainKMeans();
302+
303+
// Read data
304+
List<PopulatedPlace> places = readPlaces();
305+
306+
if (places.isEmpty()) {
307+
System.err.println("No data loaded. Please ensure " + INPUT_FILE + " exists.");
308+
return;
309+
}
310+
311+
// Apply clustering
312+
applyKMeansClustering(places);
313+
314+
// Display samples
315+
displaySamplePlaces(places);
316+
317+
// Write results
318+
writeResults(places);
319+
320+
System.out.println("\n" + "=".repeat(60));
321+
System.out.println("CLUSTERING DEMONSTRATION COMPLETED!");
322+
System.out.println("=".repeat(60));
323+
System.out.println();
324+
System.out.println("Key Takeaways:");
325+
System.out.println("──────────────");
326+
System.out.println("1. K-means groups points by proximity into K clusters");
327+
System.out.println("2. Useful for creating geographic zones and territories");
328+
System.out.println("3. Fast algorithm suitable for large datasets");
329+
System.out.println("4. Results depend on K value and initial placement");
330+
System.out.println();
331+
332+
} catch (IOException e) {
333+
System.err.println("Error: " + e.getMessage());
334+
e.printStackTrace();
335+
} finally {
336+
// Finalize MEOS
337+
meos_finalize();
338+
}
339+
}
340+
}

0 commit comments

Comments
 (0)