Setup:
[ol]
[li]Unterverzeichnisse data und texts anlegen.[/li][li]Stoppwörter von hier ( http://solariz.de/649/deutsche-stopwords.htm ) in data/stopwords.txt kopieren, dabei mit Semikolon beginnende Zeilen entfernen.[/li][li]Diverse Texte mit Endung *.txt in texts anlegen. Als Quellen können etwa Nachrichtenseiten dienen.[/li][/ol]
Beispielausgabe: http://people.ermshaus.org/marc/tmp/html.de/textcomp-example.html
[php]<?php
/**
- Copyright 2011 Marc Ermshaus
- This code is free software: you can redistribute it and/or modify it under
- the terms of the GNU General Public License as published by the Free Software
- Foundation, either version 3 of the License, or (at your option) any later
- version.
- This code is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this code. If not, see http://www.gnu.org/licenses/.
*/
/**
 * Tokenized representation of a single text.
 *
 * Loads a text from a file or a string, optionally strips HTML tags,
 * lowercases it, extracts every run of letters that reaches the minimum token
 * length, drops stopwords and stores how often each remaining token occurs.
 *
 * @version 2011-Feb-13
 * @author  Marc Ermshaus http://www.ermshaus.org/
 */
class TextComparator_Item
{
    /** Flag for loadText(): $input is a path/URL whose content is read. */
    const FROM_FILE = 0x01;

    /** Flag for loadText(): HTML/PHP tags are stripped before tokenizing. */
    const STRIP_HTML = 0x02;

    /**
     * Stopwords as a hash map: the KEYS are the (lowercase) words to ignore,
     * the values are irrelevant. Lookups are done with isset().
     *
     * @var array
     */
    protected $stopwords = array();

    /** @var int Minimum number of letters a word needs to become a token */
    protected $minimumTokenLength = 3;

    /**
     * After loadText(): map of token => number of occurrences in the text.
     *
     * @var array
     */
    protected $tokens = array();

    /**
     * Loads a text and builds the token => count map for it.
     *
     * @param string   $input Text itself, or a file name if FROM_FILE is set
     * @param int|null $flags Bitmask of FROM_FILE and STRIP_HTML; null (the
     *                        default) means FROM_FILE | STRIP_HTML
     *
     * @throws RuntimeException If FROM_FILE is set and the file cannot be read
     */
    public function loadText($input, $flags = null)
    {
        if ($flags === null) {
            $flags = self::FROM_FILE | self::STRIP_HTML;
        }

        if ($flags & self::FROM_FILE) {
            $source = $input;
            $input  = file_get_contents($source);

            // Do not silently continue with "false" as if it were an empty text
            if ($input === false) {
                throw new RuntimeException(
                    'Unable to read text from "' . $source . '"'
                );
            }
        }

        if ($flags & self::STRIP_HTML) {
            $input = strip_tags($input);
        }

        $this->tokenize($input);
        $this->groupByToken();
    }

    /**
     * Splits the input into a flat list of lowercase tokens.
     *
     * Every run of at least $minimumTokenLength Unicode letters is a token.
     * Stopwords are removed. Duplicates are kept on purpose, because
     * groupByToken() needs them to count occurrences. The list is stored in
     * $this->tokens.
     *
     * @param string $input UTF-8 encoded text
     *
     * @throws RuntimeException If the text cannot be processed as UTF-8
     */
    protected function tokenize($input)
    {
        // A quantifier of {0,} or a negative bound would make the pattern
        // useless or invalid, so at least one letter is always required
        $minLength = max(1, (int) $this->minimumTokenLength);

        // Encoding is passed explicitly because the pattern below is matched
        // in UTF-8 mode regardless of the global mb_internal_encoding()
        $input = mb_strtolower((string) $input, 'UTF-8');

        $matches = array();
        $found   = preg_match_all(
            '/\p{L}{' . $minLength . ',}/u',
            $input,
            $matches
        );

        if ($found === false) {
            throw new RuntimeException('Text could not be tokenized as UTF-8');
        }

        // Remove stopwords
        $tokens = array();
        foreach ($matches[0] as $token) {
            if (!isset($this->stopwords[$token])) {
                $tokens[] = $token;
            }
        }

        $this->tokens = $tokens;
    }

    /**
     * Converts the token list created by tokenize() into a map of
     * token => number of occurrences.
     */
    protected function groupByToken()
    {
        $this->tokens = array_count_values($this->tokens);
    }

    /**
     * @return array Stopword map (stopwords are the keys)
     */
    public function getStopwords()
    {
        return $this->stopwords;
    }

    /**
     * @param array $stopwords Map whose keys are the lowercase stopwords,
     *                         e.g. the result of array_flip() on a word list
     */
    public function setStopwords(array $stopwords)
    {
        $this->stopwords = $stopwords;
    }

    /**
     * @return int
     */
    public function getMinimumTokenLength()
    {
        return $this->minimumTokenLength;
    }

    /**
     * @param int $minimumTokenLength Cast to int; values below 1 are treated
     *                                as 1 when a text is tokenized
     */
    public function setMinimumTokenLength($minimumTokenLength)
    {
        $this->minimumTokenLength = (int) $minimumTokenLength;
    }

    /**
     * @return array Map of token => number of occurrences
     */
    public function getTokens()
    {
        return $this->tokens;
    }
}
/*
 * NOTE: The TextComparator class used below (constructed with the stopword
 * map, providing createNewItem() and compare()) is not contained in this
 * listing and has to be supplied separately.
 */

// Setup
error_reporting(-1);
mb_internal_encoding('UTF-8');
header('Content-Type: text/html; charset=UTF-8');

// Escapes a value for output in HTML context
$esc = function ($value) {
    return htmlspecialchars((string) $value, ENT_QUOTES, 'UTF-8');
};

// Load stopwords. We are using hashed arrays for all expensive calculations
// as they allow for fast lookup (O(1)). This should be a classic time-memory
// tradeoff
$stopwordList = file(
    './data/stopwords.txt',
    FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES
);
// file() returns false if the list cannot be read; continue without stopwords
$stopwords = ($stopwordList === false) ? array() : array_flip($stopwordList);

// Initialize comparator instance
$tc = new TextComparator($stopwords);

// Holds all loaded texts
$allTexts = array();

// Load and tokenize all texts in subdirectory
$texts = glob('./texts/*.txt');
if ($texts === false) {
    // glob() signals an error with false, which foreach cannot iterate
    $texts = array();
}

$matrix = array();

foreach ($texts as $file) {
    $allTexts[pathinfo($file, PATHINFO_FILENAME)] = $tc->createNewItem($file);
}

// Compare all texts in subdirectory with each other
foreach ($allTexts as $file1 => $t1) {
    foreach ($allTexts as $file2 => $t2) {
        if ($file1 !== $file2) {
            $matrix[$file1][$file2] = $tc->compare($t1, $t2);
        } else {
            $matrix[$file1][$file2] = '-';
        }
    }
}

// Render the results
echo '<table>';
echo '<tr>';
echo '<th>Text/CompareTo</th>';

// Header row: every row has the same columns, so the first row is sufficient
foreach ($matrix as $file1 => $cols) {
    foreach ($cols as $file2 => $unused) {
        // Keys already are bare file names (see loading loop above)
        echo '<th>' . $esc($file2) . '</th>';
    }
    break;
}

echo '</tr>';

foreach ($matrix as $file1 => $cols) {
    echo '<tr>';
    echo '<th>' . $esc($file1) . '</th>';

    foreach ($cols as $file2 => $points) {
        echo '<td>' . $esc($points) . '</td>';
    }

    echo '</tr>';
}

echo '</table>';

if (function_exists('xdebug_time_index')) {
    echo '<p>Current execution time: ' . round(xdebug_time_index(), 2) . ' s</p>';
}

// Show top 3 similar texts for each text
foreach ($allTexts as $file1 => $text1) {
    $points = array();

    foreach ($allTexts as $file2 => $text2) {
        if ($file1 !== $file2) {
            $points[$file2] = $tc->compare($text1, $text2);
        }
    }

    arsort($points);

    // preserve_keys = true: file names like "1" or "2" become integer keys,
    // which array_slice() would otherwise renumber
    $top = array_slice($points, 0, 3, true);

    echo '<p>Similar texts for ' . $esc($file1) . ':</p>';
    echo '<ol>';

    foreach ($top as $file => $score) {
        echo '<li>' . $esc($file) . ' (' . $esc($score) . ')</li>';
    }

    echo '</ol>';
}

if (function_exists('xdebug_time_index')) {
    echo '<p>Current execution time: ' . round(xdebug_time_index(), 2) . ' s</p>';
}[/php]
Aktueller Algorithmus zur Berechnung der Punktzahl:
[code]Token := Wort aus einem Text, das notwendige Eigenschaften
erfüllt, um berücksichtigt zu werden (zum Beispiel: mindestens 3 Zeichen lang)
t1 := Originaltext
t2 := Text, dessen Ähnlichkeit zu t1 ermittelt werden soll
1. Vergib für jedes Token aus t1, das in t2 vorkommt, n Punkte.
   n ist die Anzahl der Vorkommen dieses Tokens in t2.
2. Teile die Gesamtpunktzahl durch die Anzahl der Tokens in t1[/code]
In Schritt 2 wird noch mit 1000 multipliziert, um Ganzzahlen zurückzubekommen.
Effekte:
[ul]
[li]Ein langer t2 erhält eine tendenziell höhere Punktzahl, da die Chance auf Entsprechungen größer ist.[/li][li]Ein kurzer t2 erhält generell niedrige Punktzahlen, da er nur wenige Tokens enthält.[/li][/ul]
Verbesserungsvorschläge zum Algorithmus sind sehr willkommen.
Edit: Irgendwo steckt übrigens offenbar ein Bug.