Comprehensive plan to migrate from CSV to MySQL database
-- Countries table for normalization
CREATE TABLE countries (
id INT PRIMARY KEY AUTO_INCREMENT,
iso3 CHAR(3) NOT NULL UNIQUE,
country_name VARCHAR(100) NOT NULL,
un_region VARCHAR(50),
barometer_region VARCHAR(50),
regional_hub VARCHAR(100),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
INDEX idx_iso3 (iso3),
INDEX idx_country_name (country_name)
);
-- Pillars lookup table
CREATE TABLE pillars (
id INT PRIMARY KEY AUTO_INCREMENT,
pillar_name VARCHAR(50) NOT NULL UNIQUE,
pillar_description TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Clusters lookup table
CREATE TABLE clusters (
id INT PRIMARY KEY AUTO_INCREMENT,
cluster_name VARCHAR(100) NOT NULL UNIQUE,
pillar_id INT,
cluster_description TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (pillar_id) REFERENCES pillars(id)
);
-- Action areas lookup table
CREATE TABLE action_areas (
id INT PRIMARY KEY AUTO_INCREMENT,
action_area_name VARCHAR(100) NOT NULL UNIQUE,
cluster_id INT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (cluster_id) REFERENCES clusters(id)
);
-- Main data table (normalized)
CREATE TABLE gdb_data (
id INT PRIMARY KEY AUTO_INCREMENT,
country_id INT NOT NULL,
pillar_id INT NOT NULL,
cluster_id INT NOT NULL,
action_area_id INT,
indicator_id VARCHAR(100),
indicator_name VARCHAR(255),
indicator_type VARCHAR(50),
variable_name VARCHAR(255),
question_type VARCHAR(50),
question_text TEXT,
hierarchy_level TINYINT,
data_type VARCHAR(50),
response TEXT,
response_value DECIMAL(10,2),
normalized_response_value DECIMAL(10,2),
weight DECIMAL(10,4),
score DECIMAL(5,2),
parent VARCHAR(100),
question_order INT,
wib DECIMAL(10,4),
wiaa DECIMAL(10,4),
wic DECIMAL(10,4),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- Foreign Keys
FOREIGN KEY (country_id) REFERENCES countries(id),
FOREIGN KEY (pillar_id) REFERENCES pillars(id),
FOREIGN KEY (cluster_id) REFERENCES clusters(id),
FOREIGN KEY (action_area_id) REFERENCES action_areas(id),
-- Indexes for performance
INDEX idx_country_pillar (country_id, pillar_id),
INDEX idx_score (score),
INDEX idx_hierarchy (hierarchy_level),
INDEX idx_indicator (indicator_id),
-- Full-text search index
FULLTEXT idx_search (question_text, response, indicator_name)
);
-- Materialized view for common queries
CREATE TABLE gdb_summary_view (
id INT PRIMARY KEY AUTO_INCREMENT,
country_iso3 CHAR(3),
country_name VARCHAR(100),
pillar_name VARCHAR(50),
cluster_name VARCHAR(100),
action_area_name VARCHAR(100),
avg_score DECIMAL(5,2),
total_indicators INT,
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY unique_summary (country_iso3, pillar_name, cluster_name, action_area_name)
);
-- Composite indexes for common filter combinations
CREATE INDEX idx_country_pillar_cluster ON gdb_data(country_id, pillar_id, cluster_id);
CREATE INDEX idx_score_hierarchy ON gdb_data(score, hierarchy_level);
CREATE INDEX idx_question_search ON gdb_data(question_text(255));
-- Covering indexes for pagination
CREATE INDEX idx_pagination_cover ON gdb_data(country_id, pillar_id, score, id);
-- Efficient filtering query
SELECT
c.country_name,
p.pillar_name,
cl.cluster_name,
aa.action_area_name,
g.indicator_name,
g.question_text,
g.response,
g.score,
c.un_region
FROM gdb_data g
JOIN countries c ON g.country_id = c.id
JOIN pillars p ON g.pillar_id = p.id
JOIN clusters cl ON g.cluster_id = cl.id
LEFT JOIN action_areas aa ON g.action_area_id = aa.id
WHERE
(c.iso3 = ? OR ? IS NULL)
AND (p.pillar_name = ? OR ? IS NULL)
AND (cl.cluster_name = ? OR ? IS NULL)
AND (g.hierarchy_level = ? OR ? IS NULL)
AND (g.score BETWEEN ? AND ?)
AND (MATCH(g.question_text, g.response, g.indicator_name) AGAINST(? IN BOOLEAN MODE) OR ? IS NULL)
ORDER BY g.score DESC, g.id
LIMIT ? OFFSET ?;
-- Efficient count query for pagination
SELECT COUNT(*) as total_count
FROM gdb_data g
JOIN countries c ON g.country_id = c.id
JOIN pillars p ON g.pillar_id = p.id
JOIN clusters cl ON g.cluster_id = cl.id
WHERE
(c.iso3 = ? OR ? IS NULL)
AND (p.pillar_name = ? OR ? IS NULL)
AND (cl.cluster_name = ? OR ? IS NULL)
AND (g.hierarchy_level = ? OR ? IS NULL)
AND (g.score BETWEEN ? AND ?)
AND (MATCH(g.question_text, g.response, g.indicator_name) AGAINST(? IN BOOLEAN MODE) OR ? IS NULL);
<?php
// import_csv_to_mysql.php
class CSVImporter {
private $pdo;
private $batchSize = 1000;
public function __construct($dsn, $username, $password) {
$this->pdo = new PDO($dsn, $username, $password, [
PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
PDO::MYSQL_ATTR_LOCAL_INFILE => true
]);
}
public function importCSV($csvFile) {
$this->pdo->beginTransaction();
try {
// Populate lookup tables first
$this->populateLookupTables($csvFile);
// Import main data
$this->importMainData($csvFile);
// Create summary views
$this->createSummaryViews();
$this->pdo->commit();
echo "Import completed successfully!\n";
} catch (Exception $e) {
$this->pdo->rollback();
throw $e;
}
}
private function populateLookupTables($csvFile) {
// Implementation for populating countries, pillars, clusters, action_areas
// Using INSERT IGNORE to handle duplicates
}
private function importMainData($csvFile) {
// Batch insert implementation with prepared statements
// Converting CSV data to normalized foreign key references
}
}
Migrating from CSV to MySQL will dramatically improve performance, scalability, and user experience. The normalized schema provides data integrity, advanced querying capabilities, and a foundation for future enhancements.
Recommended Timeline: 2-3 weeks for full migration including testing and optimization.