Search code examples
c#exceldatagridviewduplicatesoledb

Using dynamic OleDb to read excel in a DataGridView and find duplicity


I am trying to make a report (save an excel) of duplicate users from a previous excel file. The problem is that I want to show the coincidence percentage of duplicate users to save an excel report, but so far I haven't had much luck.

      public DataView DataImport(string archivename)
        {
            string conn = string.Format("Provider = Microsoft.ACE.OLEDB.12.0; Data Source = {0}; Extended Properties = 'Excel 12.0; HDR = YES; IMEX=1'", archivename);
            // Creating a connection object using the above connection string.
            OleDbConnection conector = new OleDbConnection(conn);
            // Opening connection to the database.
            conector.Open();
            OleDbCommand consult = new OleDbCommand("select * FROM [Sheet1$]", conector);
            OleDbDataAdapter adapter = new OleDbDataAdapter
            {
                SelectCommand = consult
            };
            DataSet ds = new DataSet();
            adapter.Fill(ds);
            conector.Close();
            return ds.Tables[0].DefaultView;
        }

        public DataView Duplicatedata(string archivename)
        {
                string conn = string.Format("Provider = Microsoft.ACE.OLEDB.12.0; Data Source = {0}; Extended Properties = 'Excel 12.0; HDR = YES; IMEX=1'", archivename);
                OleDbConnection conector = new OleDbConnection(conn);
                conector.Open();
                OleDbCommand consult = new OleDbCommand("select USER_CODE,NAME, COUNT(*) as DUPLICITY FROM [Sheet1$] GROUP BY USER_CODE,NAME HAVING COUNT(*)>1 ORDER BY COUNT(*) DESC", conector);
                //COUNT(*)>1 sends to count in the whole file all the data that is sent to call in the GROUP BY and will show those that are repeated more than 1, this value can change
                OleDbDataAdapter adapter = new OleDbDataAdapter
                {
                    SelectCommand = consult
                };
                DataSet ds = new DataSet();
                adapter.Fill(ds);
                conector.Close();
                return ds.Tables[0].DefaultView;
        }

       private void btn_import_Click(object sender, EventArgs e)
        {
            try
            {
                string path = "";
                OpenFileDialog openFileDialog = new OpenFileDialog
                {
                    Filter = "Excel | *.xls;*.xlsx;",
                    Title = "Select Archive"
                };

                if (openFileDialog.ShowDialog() == DialogResult.OK)
                {
                    path = openFileDialog.FileName;
                    label3.Text = path;
                    dataGridView1.DataSource = DataImport(openFileDialog.FileName);
                    btn_import.DialogResult= DialogResult.OK;
                }          
            }
            catch (Exception err)
            {
                MessageBox.Show(err.Message);
            }
        }
        private void btn_duplicates_Click(object sender, EventArgs e)
        {
            if (btn_import.DialogResult == DialogResult.OK)
            {
                try
                {
                    dataGridView1.DataSource = Duplicatedata(label3.Text);
                    lbllname.Text = "% Duplicity";
                }
                catch (Exception err)
                {
                    MessageBox.Show(err.Message);
                }
            }
            else
            {
                MessageBox.Show("Import before seeing duplicate data", "ALERT");

            }
        }
}

If another extension rather than OleDb is welcome to be use, is not specifically strict to use only OleDb

Update 8/23/2022 Implementing 100Count()/(Select count(*) From [SHEETx$])

Raw Data Example

Data with duplicity percentage, but the math 100Count()/(Select count(*) From [SHEETx$]) is with all the data, it has to be done with the duplicity count only. any idea is welcome


Solution

  • Microsoft's ACE engine is a little limited when it comes to where you can code subqueries, but you can put one in a data field like this:

    Select USER_CODE,NAME, COUNT(*) as DUPLICITY
        ,100*Count(*)/(Select count(*) From [SHEET3$]) AS PctOfTotal
    From [SHEET3$]
    GROUP BY USER_CODE,NAME
    HAVING COUNT(*)>1
    ORDER BY COUNT(*) DESC
    

    The only thing I added to your original query is an extra field on line 2 above. It includes a simple subquery to total the number of records.

    Adding Complex Filtering to the Above

    To only consider the duplicates in the percentage denominator you'll need to filter them out in a Where clause with a correlated subquery. The complexity is still in the percentage calculation:

    ,100*Count(*)/(
      Select count(*) From [SHEET3$] x
      Where 2 >= (
        Select count(*) From [SHEET3$] y
        Where y.USER_CODE=x.USER_CODE
          and y.NAME=x.NAME)
      )
    ) AS PctOfTotal
    

    This is likely to run pretty slowly, but as long as your spreadsheet isn't huge it should be OK.