Difference between revisions of "Fix ARTigo Dating"

From DHVLab

(Created page with "<source lang="java"> import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; imp...")
 
 
(2 intermediate revisions by the same user not shown)
Line 1: Line 1:
<source lang="java">
+
When accessing the data or creating plots you could for example use:
 +
<syntaxhighlight lang="sql">AVG(date_min,date_max) AS dating</syntaxhighlight>
 +
 
 +
Warning: Extremely unoptimized, slow code :-)
 +
<syntaxhighlight lang="java">
 
import java.sql.Connection;
 
import java.sql.Connection;
 
import java.sql.DriverManager;
 
import java.sql.DriverManager;
Line 177: Line 181:
  
 
}
 
}
</source>
+
</syntaxhighlight>

Latest revision as of 10:10, 20 May 2016

When accessing the data or creating plots you could for example use:

AVG(date_min,date_max) AS dating

Warning: Extremely unoptimized, slow code :-)

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/* Quick fuzzy fix to convert dating strings in the ARTigo 
 * database to date_min, date_max and an indicator about the 
 * connection between both numbers. 
 * Jh., century etc information is parsed and translated, as well
 * as ranges, estimates,....
 * 
 * (c) Linus Kohl, kohl@munichtrading.com
*/

public class MySQL {
	static final String JDBC_DRIVER = "com.mysql.jdbc.Driver";
	static final String DB_URL = "jdbc:mysql://XX/artigo";
	static final String USER = "XXX";
	static final String PASS = "XXX";
	private static Connection connect = null;
	private static Statement query = null;
	private static ResultSet resultSet = null;

	public static void readDataBase() throws Exception {
		try {
			// load jdbc driver
			Class.forName(JDBC_DRIVER);
			// connect to database
			connect = DriverManager.getConnection(DB_URL, USER, PASS);
			query = connect.createStatement();
			// select all artworks where date are not set
			// get row by row
			PreparedStatement query = connect.prepareStatement(
					"select * from sum_artworks where date_min != \"\"", 
					ResultSet.TYPE_FORWARD_ONLY,
					ResultSet.CONCUR_UPDATABLE);
			query.setFetchSize(Integer.MIN_VALUE);
			// execute query
			resultSet = query.executeQuery();
			// loop over result
			loopResultSet(resultSet);
		} catch (Exception e) {
			throw e;
		} finally {
			close();
		}
	}

	/* Extract all integers from a string 
	 * and return as array */
	public static List<Integer> extractInt(String str) {
		List<Integer> numbers = new ArrayList<Integer>();
		Matcher matcher = Pattern.compile("\\d+").matcher(str);
		while (matcher.find()) {
			numbers.add(Integer.parseInt(matcher.group()));
		}
		return numbers;
	}
	
	/* Loop over rows and and try to fix the date */
	private static void loopResultSet(ResultSet resultSet) throws SQLException {
		while (resultSet.next()) {
			// min dating
			int date_min = 0;
			// max dating
			int date_max = 0;
			// store -,/ or ~ for fuzzy input
			String operator = "";
			// get the dating string
			String dating = resultSet.getString("dating");
			if (dating != null) {
				// transform to lowercase
				dating = dating.toLowerCase();
				try {
					// parse for integers in string
					List<Integer> dates = extractInt(dating);
					// check if string contains century information
					if (dating.contains("jh") ||
					    dating.contains("jhd") ||
					    dating.contains("century") ||
					    dating.contains("ct.") ||
					    dating.contains("th") ||
						dating.contains("jahrhundert")) {
						// get largest integer between 1 and 20
						for (int i = 0; i < dates.size(); i++) {
							if(dates.get(i) >= 1 &&
							   dates.get(i)<= 20) {
							   // date_max = integer * 100 and date_min 100 years earlier 
							   date_max = dates.get(i)*100;
							   date_min = date_max-100;
							}
						}
					} else {
						// no century information found
						switch (dates.size()) {
							// no integer found
							case 0: {
								break;
							}
							case 1: {
								// one integer found, date_min = date_max 
								operator = "=";
								date_min = date_max = dates.get(0);
								break;
							}
							case 2: {
								// set indicator if the dating is before the specified date
								if (dating.contains("before") || 
									dating.contains("vor")) 
									operator = "<";
								// set indicator for /
								if (dating.contains("/"))
									operator = "/";
								// set indicator for -
								if (dating.contains("-"))
									operator = "-";
								// set indicator for fuzzy specification
								if (dating.contains("ca.") || 
									dating.contains("um "))
									operator = "~";
								// set the dates
								date_min = dates.get(0);
								date_max = dates.get(1);
								// fix for special case inputs like 1933/34
								if(date_min >= 100 && date_max <= 100) {
									date_max = date_min+date_max;
								}
								break;
							}
						}
					}
					
					System.out.println("Dating: " + dating + " " + date_min +
							           " " + date_max + " " + operator);
					// set the dates
					resultSet.updateInt("date_min", date_min);
					resultSet.updateInt("date_max", date_max);
					resultSet.updateString("date_info", operator);
					// update the db entry
					resultSet.updateRow();
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
		}
	}


	/* close the connection, query and statements */
	private static void close() {
		try {
			if (resultSet != null) {
				resultSet.close();
			}
			if (query != null) {
				query.close();
			}
			if (connect != null) {
				connect.close();
			}
		} catch (Exception e) {

		}
	}

	public static void main(String args[]) throws Exception {
		readDataBase();
	}

}