The Open Source Swiss Army Knife

/code/perl/
/code/perl/ + sub-categories
http://www.sirfsup.com/
web directory content
    
      

Not logged in
Chat Register Login
return to:  http://www.sirfsup.com      /code   /perl 
Permalink: wiki.perl
Title: parse the wiktionary file with perl
article options : please login   |  raw source view  

#!/usr/bin/perl
# download fr.wiktionary .bz2 from http://download.wikimedia.org/frwiktionary/20080622/
# save that to disk
# To run it, I am currently doing:
# [zr@localhost tmp]$ perl wiki.pl --debug wikt.part french_french examples french_english
# Using this script the whole dump can be processed (about 24 million lines);
# this warning showed up near the end of one run:
# Use of uninitialized value in concatenation (.) or string at wiki.pl line 129, <IN> line 24033409.
# To load the generated SQL into PostgreSQL, run:
# psql -d mod <  '/tmp/french_french' ; 
# psql -d mod <  '/tmp/french_english' ; 
# The examples file is loaded into the mb_posts table (in MySQL) like this:
# LOAD DATA INFILE "/tmp/examples" INTO TABLE mb_posts   
# good luck 
#

use Getopt::Long;

use Encode; 

use POSIX;

use strict;
use warnings;


# main sections
# -------------
# 1 variable declarations
# 2 subroutines
# 3 get commandline options and specify help statement
# 4 loop through file and process

# 1.  variable declarations
my %hash = ();    # per-definition data ('french_def', 'example') for the word being parsed, keyed by definition number
# command line argument variables
my ( $debug, $opt_debug, $debug_syn, $opt_debug_syn, $debug_trad, $opt_debug_trad, $debug_lang, $opt_debug_lang );
my ( $ENC_IN, $ENC_OUT, $HELP, $opt_help, $opt_enc_in, $opt_enc_out );
# processing-file variables
my ( $do_other_langs, $syn_eng, $syn1, $process_french, $post_id, $english_def );
my ( $def_number, $word, $syn, $wordid, $this_french_def, $example, $examplesStr, $new_word );
# 2 subroutines

# 3.  get commandline options and maybe print help
########################################################
# Every option is bound to its lexical explicitly.  A bare "help" would make
# Getopt::Long store the flag in the package variable $main::opt_help, which
# is not the lexical $opt_help declared above, so --help would be ignored.
my $options_ok = GetOptions(
    "help"       => \$opt_help,
    "debug"      => \$opt_debug,
    "debug_syn"  => \$opt_debug_syn,
    "debug_lang" => \$opt_debug_lang,
    "debug_trad" => \$opt_debug_trad,
    "enc_in=s"   => \$opt_enc_in,
    "enc_out=s"  => \$opt_enc_out,
);

$HELP       = $opt_help       || 0;
$debug      = $opt_debug      || 0;
$debug_syn  = $opt_debug_syn  || 0;
$debug_trad = $opt_debug_trad || 0;
$debug_lang = $opt_debug_lang || 0;
$ENC_IN     = $opt_enc_in     || 'ISO-8859-15';
$ENC_OUT    = $opt_enc_out    || 'ISO-8859-15';

# Print the usage text and stop when help was requested, an option could not
# be parsed, or fewer than the four required file names were given.
if ( !$options_ok || $HELP || @ARGV < 4 ) {
    print "\n\nUsage: perl $0 {--help --debug --debug_syn --debug_lang --debug_trad --enc_in ENC --enc_out ENC} wiktionary.xml french.sql examples.sql french_english.sql\n";
    print "\t* OPTIONS WITHOUT ARGS\n";
    print "\t--help:  prints this message \n";
    print "\t--debug: output various debugging information \n";
    print "\t--debug_syn: output debugging information about the syn field \n";
    print "\t--debug_lang: output the language code of each language section \n";
    print "\t--debug_trad: output the lines of the translation sections \n";
    print "\n\t* OPTIONS WITH ARGS\n";
    print "\t--enc_in: encoding of the wiktionary input file (default ISO-8859-15) \n";
    print "\t--enc_out: encoding of the generated output files (default ISO-8859-15) \n";
    print "\n\t* REQUIRED ARGUMENTS\n";

    # Show each positional argument next to the value received for it.
    my @arg_labels = ( 'wiktionary.xml', 'french.sql', 'examples.sql', 'french_english.sql' );
    for my $i ( 0 .. $#arg_labels ) {
        my $given = defined $ARGV[$i] ? $ARGV[$i] : 'undefined';
        print "\t$arg_labels[$i] ($given)\n";
    }
    print "\n";
    exit 1;
}


# Open the input dump and the three output files.  The bareword handles are
# kept because the rest of the script refers to them by these names.
# $! is included so a failure says why the file could not be opened.
open( IN, "<:encoding($ENC_IN)", $ARGV[0] )
    or die "can't open wiktionary file $ARGV[0]: $!";
open( FR_OUT, ">:encoding($ENC_OUT)", $ARGV[1] )
    or die "can't open outgoing sql  file $ARGV[1]: $!";
open( EXAMPLE_OUT, ">:encoding($ENC_OUT)", $ARGV[2] )
    or die "can't open outgoing sql  file $ARGV[2]: $!";
open( EN_OUT, ">:encoding($ENC_OUT)", $ARGV[3] )
    or die "can't open outgoing sql  file $ARGV[3]: $!";

# Both SQL files start with the same preamble (drop and re-create the table
# and its wordid sequence, then grant access); only the table name differs.
for my $target ( [ \*FR_OUT, 'french_french' ], [ \*EN_OUT, 'french_english' ] ) {
    my ( $out, $table ) = @{$target};
    my $sequence = $table . '_wordid_seq';

    print {$out} "--\n";
    print {$out} "drop table $table ;\n";
    print {$out} " drop sequence $sequence ;\n";
    print {$out} " create sequence $sequence start 1 ;\n";
    print {$out} "create table $table (\n";
    print {$out} "  wordid INT default nextval('$sequence'),\n";
    print {$out} "  word varchar(130),\n";
    print {$out} "  syn varchar(200),\n";
    print {$out} "  def text,\n";
    print {$out} "  posn int, \n";
    print {$out} "  pos varchar(13), \n";
    print {$out} "  submitter varchar(25), \n";
    print {$out} "  doe timestamp without time zone, \n";
    print {$out} "  wordsize smallint);\n";
    print {$out} "\n";
    print {$out} "grant all on $table to modpgwebuser;\n";
}

# First post_id written to the examples file; incremented per example row.
# NOTE(review): presumably the next free id in the target mb_posts table -- confirm.
$post_id = 410526;

# output_word()
#
# Writes out everything collected in %hash for the current $word / $syn and
# then resets the per-word state.  Takes no arguments and returns nothing
# useful; it works on the file-level variables:
#   reads   %hash, $word, $syn, $wordid, $post_id
#   writes  one INSERT per definition to FR_OUT, and one tab-separated row to
#           EXAMPLE_OUT for each definition that has an example
#   updates $wordid (once per definition) and $post_id (once per example row)
#   resets  %hash, $def_number, $new_word, $syn, $syn_eng
sub output_word {

    # Keys are the numeric definition counters; sorting them keeps the rows
    # (and the wordids handed out below) in definition order instead of the
    # arbitrary order of keys().
    for my $def_key ( sort { $a <=> $b } keys %hash ) {
        my $entry = $hash{$def_key};

        # An entry can hold an example without a definition (example line seen
        # before any definition line); write an empty def instead of
        # concatenating undef.
        my $french_def = defined $entry->{'french_def'} ? $entry->{'french_def'} : '';

        print FR_OUT "Insert into french_french (wordid,word,syn,def)  values  ("
          . $wordid . ",'"
          . $word . "','"
          . $syn . "','"
          . $french_def . "');\n";

        # Column order of the examples file:
        # post_id (auto_increment), forum_id, topic_id, poster_id, poster_name,
        # post_text, post_time, edit_time, poster_ip, post_status, language,
        # wordid, post_text_sideview (left empty)
        if ( $entry->{'example'} ) {
            my $topic_id = $wordid + 8000000;
            print EXAMPLE_OUT join( "\t",
                $post_id, '17', $topic_id, '594', 'wikt',
                $entry->{'example'},
                '2008-09-10 23:10:10', '2008-09-10 23:10:10',
                '127.0.0.1', '0', 'french', $wordid, "\n" );
            $post_id++;
        }
        $wordid++;
    }

    # Reset the per-word state.  The loop variable above has its own name so
    # that this assignment reaches the file-level $def_number counter.
    $def_number = -1;
    $new_word   = 1;
    $syn        = '';
    %hash       = ();
    $syn_eng    = '';
    return;
}

$wordid         = 1;
$word           = '';
$new_word       = 2;    # stays 2 only until the first processable page title is seen
$process_french = 0;

# Regex-quoted copy of the current page title.  Titles can contain regex
# metacharacters (parentheses, brackets, ...); interpolating them unquoted
# can abort the run with a pattern compile error.
my $word_re = '';

# 4.  loop through file and process
# Literal braces in the patterns below are escaped: newer Perl releases warn
# about, or reject, unescaped literal '{' in regular expressions.
# words are output on encountering each new synonym by calling the function output_word()
while (<IN>) {
    if ( $debug_trad && $do_other_langs ) { print STDERR $_; }    # echo the lines below the French definitions
    if (/^#:/) {
        if ($debug) { print STDERR '\$_ = ' . $_ . "\n"; }
    }

    # do all substitutions we want first: expand label templates to text
    s/\{\{fig\}\}/(figurative)/g;
    s/\{\{fam\}\}/(familier)/g;
    s/\{\{archi\}\}/(Architecture)/g;
    s/\{\{spec\}\}/(Spécialement)/g;
    s/\{\{juri\}\}/(Jurisprudence)/g;
    s/\{\{ext\}\}/(Par extension)/g;
    s/\{\{hérald\}\}/(Héraldique)/g;
    s/\{\{péj\}\}/(Péjoratif)/g;
    s/\{\{zool\}\}/(Zoologie)/g;
    s/\{\{géog\}\}/(Géographie)/g;
    s/\{\{spor\}\}/(Sport)/g;
    s/\{\{cuis\}\}/(Cuisine)/g;
    s/\{\{part\}\}/(En particulier)/g;
    s/\{\{néol\}\}/(Néologisme)/g;
    s/\{\{méton\}\}/(Par métonymie)/g;
    s/\{\{ell\}\}/(Par ellipse)/g;
    s/\{\{minéral\}\}/(Minéralogie)/g;
    s/\{\{métrol\}\}/(Métrologie)/g;
    s/\{\{mari\}\}/(Maritime)/g;
    s/\{\{\}\}/()/g;
    s/\(([^\)]+)$/$1/g;    # drop an opening paren that is never closed
    s/^([^\(]+)\)/$1/g;    # drop a closing paren that was never opened
    # must stay after the paren clean-up, as in the original statement order
    s/\{\{ling\}\}/(Linguistique)/g;

    if (/\{\{=(\w{2})=\}\}$/) {    # language section marker, e.g. {{=en=}} english, {{=ja=}} japanese
        if ($debug_lang) { print STDERR $word . " " . $1 . "\n"; }
        if ( $1 ne 'fr' ) {
            $do_other_langs = 0;
            $process_french = 0;
        }
        # for 'fr' the title branch below already set process_french and do_other_langs
    } elsif ( /^\{\{-(nom)-\|en\}\}$/ || /\{\{-trad-\}\}/ ) {
        # this ends french section and starts section giving translations etc. of other langs
        if ($debug_trad) { print STDERR "trad\n"; }
        $do_other_langs = 1;
        $process_french = 0;
        next;
    } elsif (/<title>(Aide|Discussion Utilisateur|Utilisateur|Wiktionnaire|MediaWiki|Discuter):(.*)<\/title>/) {
        # pages in these namespaces are not dictionary entries: skip them
        $do_other_langs = 0;
        $process_french = 0;
        next;
    } elsif (/<title>(.*)<\/title>/) {
        my $title = $1;
        if ( $new_word != 2 ) {
            $process_french = 1;
            $do_other_langs = 0;
            output_word();    # flush the previous page; still uses the old $word
        } else {
            # very first page: nothing to flush yet, just initialise the state
            $def_number     = -1;
            $new_word       = 1;
            $do_other_langs = 0;
            $process_french = 1;
            $syn            = '';
            $syn_eng        = '';
        }
        $word = $title;
        $word =~ s/'//g;                 # remove ASCII apostrophes
        $word_re = quotemeta($word);     # taken before the escaping below, for matching raw input lines
        $word =~ s/’/\\’/g;              # backslash-escape typographic apostrophes for the output
        if ($debug) { print STDERR 'word = ' . $word . "\n"; }
        next;
    }

    if ( $process_french == 1 ) {    # we are inside a processable page, start with processing the french
        if (/^# (.*)$/) {
            # a definition, e.g.
            # # [[ensemble|Ensemble]], [[enchaînement]] de toutes les [[connaissance]]s.
            $def_number++;
            $examplesStr     = '<br>';    # start it over again for this definition
            $this_french_def = $1;
            $this_french_def =~ s/\{\{ucf\|([^}]+)\}\}/$1/;
            $this_french_def =~ s/\{\{term\|([^}]+)\}\}/$1/;
            $this_french_def =~ s/\{\{vx\}\}/(vieux)/g;
            $this_french_def =~ s/\[\[[^\|]+\|//g;    # [[target|shown]]: drop the link target, keep the shown text
            $this_french_def =~ s/\[\[//g;
            $this_french_def =~ s/\]\]//g;
            $this_french_def =~ s/'//g;               # remove ASCII apostrophes
            $this_french_def =~ s/’/\\’/g;            # backslash-escape typographic apostrophes
            $hash{$def_number}{'french_def'} = $this_french_def;
            if ($debug) { print STDERR 'french_def = ' . $this_french_def . "\n"; }
            $new_word = 0;
        } elsif (/#:(.*)$/) {
            # an example, e.g.   #: '''''Encyclopédie''' des connaissances humaines.''
            $example = $1;
            $example =~ s/'//g;       # remove ASCII apostrophes
            $example =~ s/’/\\’/g;    # backslash-escape typographic apostrophes
            $examplesStr .= '<br> ' . $example;
            $hash{$def_number}{'example'} = $examplesStr;
            if ($debug) { print STDERR 'example = ' . $example . "\n"; }
        } elsif (/^\{\{-(verb|nom)-\|fr\}\}$/) {
            my $part_of_speech = $1;
            if ( $syn ne '' ) { output_word(); }    #output if not first
            $syn = $part_of_speech;
            if ($debug_syn) { print STDERR $_ . 'syn verb|nom-fr = ' . $syn . "\n"; }
        } elsif (/^'''${word_re}'''.*(3egroupe)/) {
            # must come after the example branch cuz some exs contain the word itself in triple quotes
            $syn .= ' ' . $1;
            if ($debug_syn) { print STDERR $_ . 'syn 1 = ' . $syn . "\n"; }
        } elsif ( /^'''${word_re}''' (.*)$/ && $syn_eng eq '' ) {
            # these also may divide up a definition (see: voler, which has trans and
            # intrans. synonym entries) therefore ... output_word()
            my $headword_rest = $1;
            if ( $syn ne '' ) { output_word(); }    #output if not first
            $syn1 = $headword_rest;
            $syn1 =~ s/\{\{pron\|[^}]+\}\} ou \{\{pron\|[^}]+\}\}//g;
            $syn1 =~ s/\{\{pron\|[^}]+\}\}//g;
            $syn1 =~ s/&lt;!--//;
            $syn1 =~ s/\{\{sp\}\}/ sing. & pl. /;
            $syn1 =~ s/\{\{inv\}\}/ invariable /;
            $syn1 =~ s/\{\{p\}\}/ pl. /;
            $syn1 =~ s/\{\{i\}\}/ intrans. /;
            $syn1 =~ s/\{\{t\}\}/ trans. /;
            $syn1 =~ s/\{\{msing\}\}/ m. sing. /;
            $syn1 =~ s/\{\{fsing\}\}/ f. sing. /;
            $syn1 =~ s/\[\[//g;
            $syn1 =~ s/\]\]//g;
            $syn1 =~ s/'//g;    # remove ASCII apostrophes
            $syn .= ' ' . $syn1;
            $syn =~ s/\{\{//;
            $syn =~ s/\}\}//;
            if ($debug_syn) { print STDERR $_ . 'syn 2 = ' . $syn . "\n"; }
        }
    }    #  if ($process_french)

    #   * {{en}} : {{trad|en|encyclopedia}}, {{trad|en|encyclopaedia}}
    if ($do_other_langs) {
        if (/^\{\{-(nom)-\|en\}\}$/) {
            # NOTE(review): the same pattern is handled above with 'next', so
            # this branch is not reached; kept as in the original.
            my $english_pos = $1;
            if ( $syn ne '' ) { output_word(); }    #output if not first
            $syn_eng = $english_pos;
        } elsif (/\* \{\{en\}\} : \{\{trad\|en\|([^}]*)\}\} (.*)/) {
            $english_def = $1 . ' ' . $2;
            $english_def =~ s/\{\{vx\}\}//g;
            $english_def =~ s/\{\{[^t]+tats-Unis\}\}//g;
            $english_def =~ s/\{\{trad\|en\|//g;
            $english_def =~ s/\}\}//g;
            $english_def =~ s/’/\\’/g;    # backslash-escape typographic apostrophes
            $english_def =~ s/'/\\'/g;    # backslash-escape ASCII apostrophes
            if ($debug) { print STDERR 'english_def = ' . $english_def . "\n"; }
            print EN_OUT "Insert into french_english (word,syn,def)  values  ('" . $word . "','" . $syn . "','" . $english_def . "');\n";
        }
    }
}
output_word();    # flush the last page

print FR_OUT "update french_french set syn='',posn=1,pos='1',submitter='wikt',doe='2008-09-10 09:22:22',wordsize=1;\n";

# Buffered write errors only surface at close, so check the output handles.
close(IN);
close(FR_OUT)      or die "error closing $ARGV[1]: $!";
close(EXAMPLE_OUT) or die "error closing $ARGV[2]: $!";
close(EN_OUT)      or die "error closing $ARGV[3]: $!";


Leave a Reply
Your Name:     anonymous
Your Email:
Website:  
Comments:

The author will be notified of your reply.
return to top