In case you would still rather not throw an error when multiple delimiters are found, here is a working version:
csv.php
<?php
/**
 * Guesses the field delimiter(s) used by a delimited text file.
 *
 * Each line is scanned for candidate delimiter characters that appear outside
 * of double quotes. A delimiter keeps accumulating lines as long as consecutive
 * lines containing it agree on the per-line count; a disagreeing line restarts
 * the tally for that delimiter from that line.
 */
class CSV
{
    private $filePath;
    private $fileContents;

    // Characters that may act as a delimiter: # , ; : | and tab.
    // (Single-quoted on purpose: PCRE, not PHP, interprets the \t escape.)
    // An alternative strategy would be to treat every character that is not a
    // letter, digit, dot or whitespace as a candidate instead of whitelisting.
    const ACCEPTABLE_DELIMITERS = '~[#,;:|\t]~';

    /**
     * Reads the file into memory, one array entry per line (line endings kept).
     *
     * @param string $file Path of the file to inspect.
     */
    public function __construct($file)
    {
        $this->filePath = $file;

        // file() returns false when the file cannot be read; fall back to an
        // empty list so getDelimiter() simply reports "nothing found".
        $contents = file($file);
        $this->fileContents = ($contents === false) ? array() : $contents;
    }

    /**
     * Detects the delimiter(s) of the file.
     *
     * @return array Map of delimiter character => array('count' => occurrences
     *               per line, 'lines' => array(lineNumber => raw line)), with the
     *               delimiter covering the most lines first. Only delimiters
     *               whose per-line count equals that of the first entry are
     *               returned. Empty when no delimiter was found.
     */
    public function getDelimiter()
    {
        $delimitersData = array();

        foreach ($this->fileContents as $lineNumber => $line)
        {
            $lineDelimiters = $this->countDelimitersInLine($line, $lineNumber);

            // Lines without any candidate (including blank lines) carry no information
            if (empty($lineDelimiters))
            {
                continue;
            }

            $delimitersData = empty($delimitersData)
                ? $lineDelimiters
                : $this->mergeLineDelimiters($delimitersData, $lineDelimiters);
        }

        // Put the delimiter that covers the most lines first (keys are preserved)
        uasort($delimitersData, array(__CLASS__, 'sortDelimiters'));

        // Keep the primary delimiter plus any other delimiter with the same per-line
        // count; a delimiter with a different count cannot describe the same columns
        $primaryCount = null;
        $finalDelimiterData = array();
        foreach ($delimitersData as $delimiter => $info)
        {
            if ($primaryCount === null)
            {
                $primaryCount = $info['count'];
            }

            if ($info['count'] === $primaryCount)
            {
                $finalDelimiterData[$delimiter] = $info;
            }
        }

        return $finalDelimiterData;
    }

    /**
     * Counts the unquoted candidate delimiters found on a single line.
     *
     * @param string $line       Raw line as returned by file() (may end in a line break).
     * @param int    $lineNumber Zero-based index of the line in the file.
     *
     * @return array Map of delimiter => array('count' => int, 'lines' => array(lineNumber => line)).
     */
    private function countDelimitersInLine($line, $lineNumber)
    {
        $found = array();
        $quoted = false;

        // Strip only the line terminator, so that the final line of a file that
        // lacks a trailing newline is still scanned up to its last character
        $content = rtrim($line, "\r\n");
        $length = strlen($content);

        for ($i = 0; $i < $length; $i++)
        {
            $char = $content[$i];

            // A double quote toggles the "inside quotes" state
            if ($char === '"')
            {
                $quoted = !$quoted;
                continue;
            }

            if ($quoted || !preg_match(self::ACCEPTABLE_DELIMITERS, $char))
            {
                continue;
            }

            if (isset($found[$char]))
            {
                $found[$char]['count']++;
            }
            else
            {
                // Remember the raw line so callers know which lines use this delimiter
                $found[$char] = array(
                    'count' => 1,
                    'lines' => array($lineNumber => $line),
                );
            }
        }

        return $found;
    }

    /**
     * Folds the delimiters of the current line into the running tally.
     *
     * array_merge() is deliberately not used: the delimiter characters and line
     * numbers are meaningful keys and must be preserved.
     *
     * @param array $previous Tally accumulated from earlier lines.
     * @param array $current  Delimiters found on the current line.
     *
     * @return array The updated tally; delimiters of the current line come first.
     */
    private function mergeLineDelimiters($previous, $current)
    {
        $merged = $current;

        foreach ($previous as $delimiter => $info)
        {
            if (!array_key_exists($delimiter, $current))
            {
                // Not present on this line: carry the earlier data over untouched
                $merged[$delimiter] = $info;
            }
            else if ($current[$delimiter]['count'] === $info['count'])
            {
                // Same count as before: append the earlier lines to the current one
                // (line numbers are unique, so the union cannot overwrite anything)
                $merged[$delimiter]['lines'] += $info['lines'];
            }
            // Otherwise the counts disagree and the current line's entry replaces
            // the earlier tally for this delimiter
        }

        return $merged;
    }

    /**
     * Comparison callback: orders delimiters by number of lines (descending),
     * then by per-line count (descending) so that the ordering is consistent.
     *
     * @param array $a Delimiter data with 'count' and 'lines' keys.
     * @param array $b Delimiter data with 'count' and 'lines' keys.
     *
     * @return int -1 when $a sorts first, 1 when $b sorts first, 0 when equivalent.
     */
    public static function sortDelimiters($a, $b)
    {
        $aLines = count($a['lines']);
        $bLines = count($b['lines']);
        if ($aLines !== $bLines)
        {
            return $aLines > $bLines ? -1 : 1;
        }

        if ($a['count'] !== $b['count'])
        {
            return $a['count'] > $b['count'] ? -1 : 1;
        }

        return 0;
    }
}
test.php
<?php
// Demo script: runs the delimiter detection on each sample file and prints the
// best guess followed by the full detection data.
include('csv.php');

// Full sample set:
//$files = array('data.txt', 'comma.txt', 'colon.txt', 'pipe.txt', 'pound.txt', 'semicolon.txt', 'tab.txt', 'email.txt', 'mixture.txt');
$files = array('data.txt', 'mixture.txt');

foreach ($files as $file)
{
    $csv = new CSV('files/' . $file);
    $delimiterData = $csv->getDelimiter();

    // The result is sorted, so the first key is the most likely delimiter;
    // key() yields null when nothing was detected
    $delimiter = key($delimiterData);
    if ($delimiter === null)
    {
        echo 'No delimiter detected for ' . $file . '<br />';
    }
    else
    {
        echo 'Delimiter for ' . $file . ' is ' . $delimiter . ' (' . ord($delimiter) . ')<br />';
    }

    echo '<pre>';
    // var_dump() prints directly and returns nothing, so it must not be echoed
    var_dump($delimiterData);
    echo '</pre><br />';
}
data.txt
abc,111
def; 111
ijk; 222
output
Delimiter for data.txt is ; (59)
array(2) {
[";"]=>
array(2) {
["count"]=>
int(1)
["lines"]=>
array(2) {
[2]=>
string(8) "ijk; 222"
[1]=>
string(10) "def; 111
"
}
}
[","]=>
array(2) {
["count"]=>
int(1)
["lines"]=>
array(1) {
[0]=>
string(9) "abc,111
"
}
}
}
mixture.txt
this|is|"a test"|to|123|see|how|it|works
this; is; "a test"; to; 123; see; how; it; works
123.|can?|you&|see|what|I'm|doing?|eight*|nine
output
Delimiter for mixture.txt is | (124)
array(2) {
["|"]=>
array(2) {
["count"]=>
int(8)
["lines"]=>
array(2) {
[2]=>
string(46) "123.|can?|you&|see|what|I'm|doing?|eight*|nine"
[0]=>
string(42) "this|is|"a test"|to|123|see|how|it|works
"
}
}
[";"]=>
array(2) {
["count"]=>
int(8)
["lines"]=>
array(1) {
[1]=>
string(50) "this; is; "a test"; to; 123; see; how; it; works
"
}
}
}