#!/usr/bin/perl
# download fr.wiktionary .bz2 from http://download.wikimedia.org/frwiktionary/20080622/
# save that to disk
# to run it, I currently use:
# [zr@localhost tmp]$ perl wiki.pl --debug wikt.part french_french examples french_english
# the whole dump can be processed with this script; a run produced warnings such as:
# Use of uninitialized value in concatenation (.) or string at wiki.pl line 129, <IN> line 24033409.
# lines ~~
# for postgres do
# psql -d mod < '/tmp/french_french' ;
# psql -d mod < '/tmp/french_english' ;
# for mb_posts (in mysql) load the examples file like this:
# LOAD DATA INFILE "/tmp/examples" INTO TABLE mb_posts
# good luck
#
use Getopt::Long;
use Encode;
use POSIX;
use strict;
use warnings;
# main sections
# -------------
# 1 variable declarations
# 2 subroutines
# 3 get commandline options and specify help statement
# 4 loop through file and process
# 1. variable declarations
# Definitions gathered for the word currently being parsed, keyed by
# definition number; each entry may hold a 'french_def' and an 'example'.
my %hash;

# command line argument variables
# raw values as delivered by GetOptions
my (
    $opt_help,
    $opt_debug,
    $opt_debug_syn,
    $opt_debug_lang,
    $opt_debug_trad,
    $opt_enc_in,
    $opt_enc_out,
);
# effective settings derived from the raw values
my (
    $HELP,
    $debug,
    $debug_syn,
    $debug_lang,
    $debug_trad,
    $ENC_IN,
    $ENC_OUT,
);

# processing-file variables
# parser mode flags ($new_word is 2 until the first page title has been seen)
my ( $process_french, $do_other_langs, $new_word );
# counters used when writing rows
my ( $wordid, $post_id, $def_number );
# current page title and its part-of-speech / headword information
my ( $word, $syn, $syn1, $syn_eng );
# scratch values for the line being processed
my ( $this_french_def, $english_def, $example, $examplesStr );
# 2 subroutines
# 3. get commandline options and maybe print help
########################################################
# Parse the command line.  Every option is given an explicit destination: a
# bare "help" entry makes Getopt::Long store into the package variable
# $main::opt_help, which the lexical $opt_help declared in this file never sees.
my $default_encoding = 'ISO-8859-15';
my $options_ok = GetOptions(
    "help"       => \$opt_help,
    "debug"      => \$opt_debug,
    "debug_syn"  => \$opt_debug_syn,
    "debug_lang" => \$opt_debug_lang,
    "debug_trad" => \$opt_debug_trad,
    "enc_in=s"   => \$opt_enc_in,
    "enc_out=s"  => \$opt_enc_out,
);

# An unparsable command line is treated like --help instead of being ignored.
$HELP       = ( $opt_help || !$options_ok ) ? 1 : 0;
$debug      = $opt_debug      || 0;
$debug_syn  = $opt_debug_syn  || 0;
$debug_trad = $opt_debug_trad || 0;
$debug_lang = $opt_debug_lang || 0;
$ENC_IN     = $opt_enc_in     || $default_encoding;
$ENC_OUT    = $opt_enc_out    || $default_encoding;

# All four file arguments are required; print the usage and stop otherwise.
if ( $HELP || @ARGV < 4 ) {
    print "\n\nUsage: perl $0 {--help --debug --debug_syn --debug_lang --debug_trad --enc_in ENC --enc_out ENC} wiktionary.xml french.sql examples.sql french_english.sql\n";
    print "\t* OPTIONS WITHOUT ARGS\n";
    print "\t--help: prints this message \n";
    print "\t--debug: output various debugging information \n";
    print "\t--debug_syn: output the part-of-speech / headword lines as they are parsed \n";
    print "\t--debug_lang: output each language section marker that is met \n";
    print "\t--debug_trad: output the lines of the translation sections \n";
    print "\n\t* OPTIONS WITH ARGS\n";
    print "\t--enc_in: encoding of the wiktionary in file (default $default_encoding) \n";
    print "\t--enc_out: encoding of the out files (default $default_encoding) \n";
    print "\n\t* REQUIRED ARGUMENTS\n";

    # Show what was received for every positional argument.
    my @required = ( 'wiktionary.xml', 'french.sql', 'examples.sql', 'french_english.sql' );
    for my $position ( 0 .. $#required ) {
        my $given = defined $ARGV[$position] ? $ARGV[$position] : 'undefined';
        print "\t$required[$position] ($given)\n";
    }
    print "\n";
    exit 1;
}
# Open the dump for reading and the three result files for writing, using the
# encodings chosen on the command line.  $! is reported so a failure says why.
open(IN, "<:encoding($ENC_IN)", $ARGV[0]) || die "can't open wiktionary file $ARGV[0]: $!";
open(FR_OUT, ">:encoding($ENC_OUT)", $ARGV[1]) || die "can't open outgoing sql file $ARGV[1]: $!";
open(EXAMPLE_OUT, ">:encoding($ENC_OUT)", $ARGV[2]) || die "can't open outgoing sql file $ARGV[2]: $!";
open(EN_OUT, ">:encoding($ENC_OUT)", $ARGV[3]) || die "can't open outgoing sql file $ARGV[3]: $!";

# Both SQL files start with the same drop/create statements; only the table
# name (and the sequence name derived from it) differs.
for my $target ( [ \*FR_OUT, 'french_french' ], [ \*EN_OUT, 'french_english' ] ) {
    my ( $fh, $table ) = @$target;
    my $sequence = $table . '_wordid_seq';
    print {$fh}
        "--\n",
        "drop table $table ;\n",
        " drop sequence $sequence ;\n",
        " create sequence $sequence start 1 ;\n",
        "create table $table (\n",
        " wordid INT default nextval('$sequence'),\n",
        " word varchar(130),\n",
        " syn varchar(200),\n",
        " def text,\n",
        " posn int, \n",
        " pos varchar(13), \n",
        " submitter varchar(25), \n",
        " doe timestamp without time zone, \n",
        " wordsize smallint);\n",
        "\n",
        "grant all on $table to modpgwebuser;\n";
}

# First value written to the post_id column of the examples file;
# output_word() increments it after every example row.
$post_id = 410526;
# Write out everything collected in %hash for the current $word / $syn and
# reset the per-word parser state.
#
# For every definition one INSERT goes to FR_OUT; when the definition has
# examples, one tab-separated row goes to EXAMPLE_OUT as well.  $wordid is
# advanced once per definition, $post_id once per example row.
# Takes no arguments; the return value is not meaningful.
sub output_word {
    # Numeric sort: "keys" alone yields an arbitrary order, which would hand
    # out wordids in a different order than the definitions were read.
    for my $def_key ( sort { $a <=> $b } keys %hash ) {
        my $entry = $hash{$def_key};

        # An example line met before any definition leaves 'french_def' unset;
        # emit an empty def rather than warn about an uninitialized value.
        my $french_def = defined $entry->{'french_def'} ? $entry->{'french_def'} : '';

        print FR_OUT "Insert into french_french (wordid,word,syn,def) values ("
            . $wordid . ",'" . $word . "','" . $syn . "','" . $french_def . "');\n";

        # post_id (auto_increment) forum_id topic_id poster_id poster_name post_text post_time edit_time poster_ip post_status language wordid post_text_sideview
        if ( $entry->{'example'} ) {
            my $topic_id = $wordid + 8000000;
            print EXAMPLE_OUT join( "\t",
                $post_id, '17', $topic_id, '594', 'wikt', $entry->{'example'},
                '2008-09-10 23:10:10', '2008-09-10 23:10:10', '127.0.0.1',
                '0', 'french', $wordid, '' ) . "\n";
            $post_id++;
        }
        $wordid++;
    }

    # Reset the file-level state.  The loop above uses its own variable so
    # that this assignment reaches the shared $def_number and is not absorbed
    # by a local of the same name.
    $def_number = -1;
    $new_word   = 1;
    $syn        = '';
    %hash       = ();
    $syn_eng    = '';
    return;
}
# Starting state of the parser.
$wordid         = 1;    # first wordid written to french_french
$word           = '';
$new_word       = 2;    # 2 = no page title has been seen yet
$process_french = 0;
# The remaining state is given defined values as well, so nothing is read
# as undef should a section marker show up before the first page title.
$do_other_langs = 0;
$def_number     = -1;
$syn            = '';
$syn_eng        = '';
$examplesStr    = '';
# Read the dump line by line.
# words are output on encountering each new synonym by calling the function output_word()

# Context-label templates and the text each one is replaced with.  The list
# is applied top to bottom to every input line.
my @template_labels = (
    [ '{{fig}}',     '(figurative)' ],
    [ '{{fam}}',     '(familier)' ],
    [ '{{archi}}',   '(Architecture)' ],
    [ '{{spec}}',    '(Spécialement)' ],
    [ '{{juri}}',    '(Jurisprudence)' ],
    [ '{{ext}}',     '(Par extension)' ],
    [ '{{hérald}}',  '(Héraldique)' ],
    [ '{{péj}}',     '(Péjoratif)' ],
    [ '{{zool}}',    '(Zoologie)' ],
    [ '{{géog}}',    '(Géographie)' ],
    [ '{{spor}}',    '(Sport)' ],
    [ '{{cuis}}',    '(Cuisine)' ],
    [ '{{part}}',    '(En particulier)' ],
    [ '{{néol}}',    '(Néologisme)' ],
    [ '{{méton}}',   '(Par métonymie)' ],
    [ '{{ell}}',     '(Par ellipse)' ],
    [ '{{minéral}}', '(Minéralogie)' ],
    [ '{{métrol}}',  '(Métrologie)' ],
    [ '{{mari}}',    '(Maritime)' ],
    [ '{{}}',        '()' ],
);
# quotemeta turns every template into a literal pattern; a bare "{" inside a
# regex is deprecated or rejected by newer perls.
my @label_rewrites = map { my $literal = quotemeta $_->[0]; [ qr/$literal/, $_->[1] ] } @template_labels;

# Regex-quoted copy of the current page title, used to recognise the
# '''title''' headword line.  Interpolating $word itself would let characters
# of the title act as regex syntax.
my $headword_re = '';

while ( <IN> ) {
    if ($debug_trad && $do_other_langs) { print STDERR $_ ; } # prints out eth below the fr. def.
    if ($debug && /^#:/) { print STDERR '$_ = ' . $_ . "\n"; }

    # do all substitutions we want first
    for my $rewrite (@label_rewrites) {
        my ( $template_re, $label ) = @$rewrite;
        s/$template_re/$label/g;
    }
    s/\(([^\)]+)$/$1/g; #unmatched parens
    s/^([^\(]+)\)/$1/g; #unmatched parens
    # {{ling}} is deliberately replaced after the two paren fixes above: doing it
    # earlier would change what the second fix sees at the start of the line.
    s/\{\{ling\}\}/(Linguistique)/g;

    if (/\{\{=(\w{2})=\}\}$/) { # this word or section is from another language not from french
        # for example {{=en=}} is an english word, {{=ja=}} is a japanese word
        if ($debug_lang) { print STDERR $word . " " . $1 . "\n"; }
        if ($1 ne 'fr') {
            $do_other_langs = 0 ;
            $process_french = 0 ;
        }
        # for 'fr' nothing is needed: the title branch already set process_french and do_other_langs
    } elsif (/^\{\{-(nom)-\|en\}\}$/ || /\{\{-trad-\}\}/) { # this ends french section and starts section giving translations etc. of other langs
        if ($debug_trad) { print STDERR "trad\n"; }
        $do_other_langs = 1 ;
        $process_french = 0 ;
        next;
    } elsif (/<title>(Aide|Discussion Utilisateur|Utilisateur|Wiktionnaire|MediaWiki|Discuter):(.*)<\/title>/) {
        # pages in these namespaces are skipped
        $do_other_langs = 0 ;
        $process_french = 0 ;
        next;
    } elsif (/<title>(.*)<\/title>/) {
        my $title = $1;    # keep the capture before calling anything else
        if ($new_word != 2 ) {
            # a previous page is pending: write it out (this also resets the per-word state)
            $process_french = 1 ;
            $do_other_langs = 0 ;
            output_word() ;
        } else {
            # very first page title of the dump
            $def_number = -1 ;
            $new_word = 1 ;
            $do_other_langs = 0 ;
            $process_french = 1 ;
            $syn = '';
            $syn_eng = '';
        }
        $word = $title;
        $word =~ s/'//g;                   # drop ASCII apostrophes, they would end the SQL string
        $headword_re = quotemeta($word);   # taken before the backslash-escaping below
        $word =~ s/’/\\’/g;                # backslash-escape the typographic apostrophe
        if ($debug) { print STDERR 'word = ' . $word . "\n"; }
        next;
    }

    if ($process_french == 1 ) { # we are inside a processable page, start processing with processing the french
        if (/^# (.*)$/) { #^# [[ensemble|Ensemble]], [[enchaînement]] de toutes les [[connaissance]]s. <-- is a definition
            $def_number++;
            $examplesStr = '<br>'; # start it over again for this definition
            $this_french_def = $1;
            $this_french_def =~ s/\{\{ucf\|([^}]+)\}\}/$1/;
            $this_french_def =~ s/\{\{term\|([^}]+)\}\}/$1/;
            $this_french_def =~ s/\{\{vx\}\}/(vieux)/g;
            # [[target|shown]] -> shown]]; "]" is excluded so the match cannot run
            # from one link into the next and swallow the text between them
            $this_french_def =~ s/\[\[[^\|\]]+\|//g;
            $this_french_def =~ s/\[\[//g;
            $this_french_def =~ s/\]\]//g;
            $this_french_def =~ s/'//g;        # drop ASCII apostrophes
            $this_french_def =~ s/’/\\’/g;     # backslash-escape the typographic apostrophe
            $hash{$def_number}{'french_def'} = $this_french_def ;
            if ($debug) { print STDERR 'french_def = ' . $this_french_def . "\n"; }
            $new_word = 0 ;
        } elsif (/#:(.*)$/) { #for example: #: '''''Encyclopédie''' des connaissances humaines.''
            $example = $1;
            $example =~ s/'//g;        # drop ASCII apostrophes
            $example =~ s/’/\\’/g;     # backslash-escape the typographic apostrophe
            $examplesStr .= '<br> ' . $example ;
            $hash{$def_number}{'example'} = $examplesStr;
            if ($debug) { print STDERR 'example = ' . $example . "\n"; }
        } elsif (/^\{\{-(verb|nom)-\|fr\}\}$/) {
            my $part_of_speech = $1;
            if ($syn ne '') { output_word(); } #output if not first
            $syn = $part_of_speech ;
            if ($debug_syn) { print STDERR $_ . 'syn verb|nom-fr = ' . $syn . "\n"; }
        } elsif (/^'''$headword_re'''.*(3egroupe)/) { # must be last cuz some exs contain the word itself in triple quotes
            $syn .= ' ' . $1 ;
            if ($debug_syn) { print STDERR $_ . 'syn 1 = ' . $syn . "\n"; }
        } elsif (/^'''$headword_re''' (.*)$/ && $syn_eng eq '') { # must be last cuz some exs contain the word itself in triple quotes
            # these also may divide up a definition (see: voler, which has trans and intrans. synonym entries)
            # therefore ... output_word()
            my $headword_rest = $1;
            if ($syn ne '') { output_word(); } #output if not first
            $syn1 = $headword_rest ;
            $syn1 =~ s/\{\{pron\|[^}]+\}\} ou \{\{pron\|[^}]+\}\}//g;
            $syn1 =~ s/\{\{pron\|[^}]+\}\}//g;
            $syn1 =~ s/<!--//;
            $syn1 =~ s/\{\{sp\}\}/ sing. & pl. /;
            $syn1 =~ s/\{\{inv\}\}/ invariable /;
            $syn1 =~ s/\{\{p\}\}/ pl. /;
            $syn1 =~ s/\{\{i\}\}/ intrans. /;
            $syn1 =~ s/\{\{t\}\}/ trans. /;
            $syn1 =~ s/\{\{msing\}\}/ m. sing. /;
            $syn1 =~ s/\{\{fsing\}\}/ f. sing. /;
            $syn1 =~ s/\[\[//g;
            $syn1 =~ s/\]\]//g;
            $syn1 =~ s/'//g;           # drop ASCII apostrophes
            $syn .= ' ' . $syn1 ;
            $syn =~ s/\{\{//;
            $syn =~ s/\}\}//;
            if ($debug_syn) { print STDERR $_ . 'syn 2 = ' . $syn . "\n"; }
        }
    } # if ($process_french)

    if ($do_other_langs) {
        if (/^\{\{-(nom)-\|en\}\}$/) {
            # NOTE(review): this branch looks unreachable - a line of this shape is
            # always taken by the translation-section branch above, which ends
            # with "next".  Kept unchanged until the intent is confirmed.
            my $english_pos = $1;
            if ($syn ne '') { output_word(); } #output if not first
            $syn_eng = $english_pos ;
        } elsif (/\* \{\{en\}\} : \{\{trad\|en\|([^}]*)\}\}(.*)/) {
            # * {{en}} : {{trad|en|encyclopedia}}, {{trad|en|encyclopaedia}}
            # The rest of the line is taken as is after the first translation, so
            # a following ", {{trad|...}}" or the end of the line matches too, not
            # only a following space.
            $english_def = $1 . $2;
            $english_def =~ s/\{\{vx\}\}//g;
            $english_def =~ s/\{\{[^t]+tats-Unis\}\}//g;
            $english_def =~ s/\{\{trad\|en\|//g ;
            $english_def =~ s/\}\}//g ;
            $english_def =~ s/’/\\’/g;     # backslash-escape the typographic apostrophe
            $english_def =~ s/'/\\'/g;     # backslash-escape the ASCII apostrophe
            if ($debug) { print STDERR 'english_def = ' . $english_def . "\n"; }
            print EN_OUT "Insert into french_english (word,syn,def) values ('" . $word . "','" . $syn . "','" . $english_def . "');\n";
        }
    }
}
# Write out the definitions still pending for the last page of the dump.
output_word() ;
# One UPDATE over the whole table fills the columns the INSERTs left out.
# Note that it also sets syn to '' on every row.
print FR_OUT "update french_french set syn='',posn=1,pos='1',submitter='wikt',doe='2008-09-10 09:22:22',wordsize=1;\n";

# Buffered write errors only show up at close time, so check every output file.
close(IN);
close(FR_OUT)      || die "can't close outgoing sql file $ARGV[1]: $!";
close(EXAMPLE_OUT) || die "can't close outgoing sql file $ARGV[2]: $!";
close(EN_OUT)      || die "can't close outgoing sql file $ARGV[3]: $!";