[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
LSR dump
From: |
Werner LEMBERG |
Subject: |
LSR dump |
Date: |
Thu, 03 Oct 2019 13:02:09 +0200 (CEST) |
Folks,
attached you can find a text dump of the current version of the LSR
database, together with the script that I've written to create it.
Werner
# lsrdump.pl
#
# Written 2019 by Werner Lemberg <address@hidden>
# Convert the LSR database dump available from
#
# http://lsr.di.unimi.it/download/
#
# into plain text, omitting the images.
#
#
# Usage:
#
# perl lsrdump.pl < lsr.mysqldump > lsrdump.txt
use strict;
use warnings;
# Access mysqldump files without the need of mysql tools.
use MySQL::Dump::Parser::XS;
# The LSR dump embeds PNG images, so STDIN must be read as raw bytes.
binmode(STDIN);

my $parser = MySQL::Dump::Parser::XS->new;

# Shared state used by the rest of the script:
#   %tables  maps a table name to an array reference holding its rows,
#   %tags    maps a tag ID to the tag's name (filled in later),
#   $table   is a scratch reference to the rows of the table being handled.
my (%tables, %tags);
my $table;

# Feed the dump to the parser line by line.  Whatever rows a line yields are
# filed under the table name the parser reports at that point; lines for
# which it reports no table are skipped.
while (defined(my $line = <STDIN>)) {
    my @rows = $parser->parse($line);
    my $current = $parser->current_target_table();
    next unless $current;
    push @{ $tables{$current} }, @rows;
}
# Walk the rows of the `tag' table and record, for every tag, its name under
# its ID number in `%tags'.
$table = $tables{"tag"};
for my $tag_row (@{$table}) {
    my ($id, $name) = @{$tag_row}{ "id", "name" };
    $tags{$id} = $name;
}

# From here on `$table' refers to the rows of the `snippet' table.
$table = $tables{"snippet"};
# add_tag_name($idx, $entry)
#
# Snippet rows keep their tags in fields named `id_tag0_tag', `id_tag1_tag',
# etc., each holding a tag ID.  If field number $idx of the row $entry (a
# hash reference) is set, overwrite the ID stored there with the matching
# name from `%tags'.  Unset fields are left alone.
sub add_tag_name {
    my ($idx, $entry) = @_;

    my $field = "id_tag${idx}_tag";
    if (defined($entry->{$field})) {
        $entry->{$field} = $tags{ $entry->{$field} };
    }
}
# Translate the tag IDs in tag fields 0 to 6 of every snippet into tag names.
for my $snippet (@{$table}) {
    add_tag_name($_, $snippet) for 0 .. 6;
}
# Emit a dump of all snippets, sorted numerically by snippet ID.
#
# Every field that is set is printed as `<column name>: <value>', in the
# order in which `$parser->columns("snippet")' lists the columns.  The
# continuation lines of multi-line values are prefixed with `| ', indented
# so that the text lines up with the first line.  An empty line follows
# each snippet, and a final marker line ends the dump.
my @column_names = $parser->columns("snippet");
foreach my $entry (sort { $a->{"id"} <=> $b->{"id"} } @{$table}) {
    for my $name (@column_names) {
        # Ignore binary data.
        next if $name eq "image" or $name eq "largeimage";

        # Ignore unset fields.
        my $data = $entry->{$name};
        next if !defined($data);

        my $tag = "$name: ";

        # Make line endings uniform.
        $data =~ s/\015\012?|\012/\n/g;
        # Remove trailing whitespace from every line; this covers tabs as
        # well as spaces.
        $data =~ s/[ \t]+$//gm;
        # Remove leading and trailing empty lines.
        $data =~ s/\A\n+//;
        $data =~ s/\n+\z//;

        # Insert a prefix to indicate continuation lines for nicer reading.
        # The value no longer ends in a newline at this point, so every
        # remaining newline is followed by a continuation line; the first
        # line is left untouched.  This avoids `$&', which slows down all
        # pattern matches of a program on Perl versions before 5.20.
        my $prefix = " " x (length($tag) - 2) . "| ";
        $data =~ s/\n/\n$prefix/g;

        print "$tag$data\n";
    }
    print "\n";
}
print "END OF DUMP\n";
# eof
lsr-2019-10-03.txt.xz
Description: Binary data
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- LSR dump,
Werner LEMBERG <=