⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 generate.py

📁 这是一个用于生成数据库测试文件的程序，对于网站的测试有很大的帮助
💻 PY
📖 第 1 页 / 共 5 页
字号:
# =============================================================================
# AUSTRALIAN NATIONAL UNIVERSITY OPEN SOURCE LICENSE (ANUOS LICENSE)
# VERSION 1.2
# 
# The contents of this file are subject to the ANUOS License Version 1.2
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
# 
#   http://datamining.anu.edu.au/linkage.html
# 
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
# the License for the specific language governing rights and limitations
# under the License.
# 
# The Original Software is: "generate.py"
# 
# The Initial Developer of the Original Software is:
#   Dr Peter Christen (Department of Computer Science, Australian National
#                      University)
# 
# Copyright (C) 2002 - 2005 the Australian National University and
# others. All Rights Reserved.
# 
# Contributors:
# 
# Alternatively, the contents of this file may be used under the terms
# of the GNU General Public License Version 2 or later (the "GPL"), in
# which case the provisions of the GPL are applicable instead of those
# above. The GPL is available at the following URL: http://www.gnu.org/
# If you wish to allow use of your version of this file only under the
# terms of the GPL, and not to allow others to use your version of this
# file under the terms of the ANUOS License, indicate your decision by
# deleting the provisions above and replace them with the notice and
# other provisions required by the GPL. If you do not delete the
# provisions above, a recipient may use your version of this file under
# the terms of any one of the ANUOS License or the GPL.
# =============================================================================
#
# Freely extensible biomedical record linkage (Febrl) - Version 0.3
#
# See: http://datamining.anu.edu.au/linkage.html
#
# =============================================================================

"""Module generate.py - Auxiliary program to create records using various
                        frequency tables and introduce duplicates with errors.

   USAGE:
     python generate.py [output_file] [num_originals] [num_duplicates]
                        [max_duplicate_per_record] [max_modification_per_field]
                        [max_modification_per_record] [distribution]

   ARGUMENTS:
     output_file                  Name of the output file (currently this is a
                                  CSV file).
     num_originals                Number of original records to be created.
     num_duplicates               Number of duplicate records to be created.
     max_duplicate_per_record     The maximal number of duplicates that can be
                                  created for one original record.
     max_modification_per_field   The maximum number of modifications per field
     max_modification_per_record  The maximum number of modifications per
                                  record.
     distribution                 The probability distribution used to create
                                  the duplicates (i.e. the number of duplicates
                                  for one original).
                                  Possible are: - uniform
                                                - poisson
                                                - zipf

   DESCRIPTION:
     This program can be used to create a data set with records that contain
     randomly created names and addresses (using frequency files), dates,
     phone numbers, and identifier numbers. Duplicate records will then be
     created following a given probability distribution, with different single
     errors being introduced.

     Various parameters on how these duplicates are created can be given
     within the program, see below.

     New: It is possible to load dictionaries (look-up table) with misspellings
          that will be used to replace a correct word with a randomly chosen
          misspelling. A user can easily customise these misspelling files.

   TODO:
     - add substitution matrix with character substitution probabilities
       (instead of keyboard based substitutions).

     - Improve performance (loading and creating frequency tables)

     - for each field have a counter num_modifications in the field dictionary

     - do swap between field first (count as 2 rec. modifications)

     - Allow various probability distributions for fields of type 'date' and
       'iden' (using a new keyword in field dictionaries).

     - Try to find real world error distributions for typographical errors and
       integrate them into the random error creation

     - Add random word spilling between fields (similar to field swapping)
"""

# =============================================================================
# Imports go here

import math
import os
import random
import sets
import string
import sys
import time

# Set this flag to True for verbose output, otherwise to False - - - - - - - -
# (only the definition lives here; the flag is read elsewhere in this module,
# presumably to guard progress/debug printing — confirm in the later sections)
#
VERBOSE_OUTPUT = True

# =============================================================================
#
# For each field (attribute), a dictionary has to be defined with the following
# keys (probabilities can have values between 0.0 and 1.0, or they can be
# missing - in which case it is assumed they have a value of 0.0):
# - name           The field name to be used when a header is written into the
#                  output file.
# - type           The type of the field. Possible are:
#                  'freq'  (for fields that use a frequency table with field
#                           values)
#                  'date'  (for date fields in a certain range)
#                  'phone' (for phone numbers)
#                  'ident' (for numerical identifier fields in a certain range)
# - char_range     The range of random characters that can be introduced. Can
#                  be one of 'alpha', 'digit', or 'alphanum'.
#
# For fields of type 'freq' the following keys must be given:
# - freq_file      The name of a frequency file.
# - misspell_file  The name of a misspellings file.
#
# For fields of type 'date' the following keys must be given:
# - start_date     A start date, must be a tuple (day,month,year).
# - end_date       An end date, must be a tuple (day,month,year).
#
# For fields of type 'phone' the following keys must be given:
# - area_codes     A list with possible area codes (as strings).
# - num_digits     The number of digits in the phone numbers (without the area
#                  code).
#
# For fields of type 'ident' the following keys must be given:
# - start_id       A start identification number.
# - end_id         An end identification number.
#
# For all fields the following keys must be given:
# - select_prob    Probability of selecting a field for introducing one or
#                  more modifications (set this to 0.0 if no modifications
#                  should be introduced into this field ever). Note: The sum
#                  of these select probabilities over all defined fields must
#                  be 100.
# - misspell_prob  Probability to swap an original value with a randomly
#                  chosen misspelling from the corresponding misspelling
#                  dictionary (can only be set to larger than 0.0 if such a
#                  misspellings dictionary is defined for the given field).
# - ins_prob       Probability to insert a character into a field value.
# - del_prob       Probability to delete a character from a field value.
# - sub_prob       Probability to substitute a character in a field value with
#                  another character.
# - trans_prob     Probability to transpose two characters in a field value.
# - val_swap_prob  Probability to swap the value in a field with another
#                  (randomly selected) value for this field (taken from this
#                  field's look-up table).
# - wrd_swap_prob  Probability to swap two words in a field (given there are
#                  at least two words in a field).
# - spc_ins_prob   Probability to insert a space into a field value (thus
#                  splitting a word).
# - spc_del_prob   Probability to delete a space (if available) in a field (and
#                  thus merging two words).
# - miss_prob      Probability to set a field value to missing (empty).
# - new_val_prob   Probability to insert a new value given the original value
#                  was empty.
#
# Note: The sum over the probabilities ins_prob, del_prob, sub_prob,
#       trans_prob, val_swap_prob, wrd_swap_prob, spc_ins_prob, spc_del_prob,
#       and miss_prob for each defined field must be 1.0; or 0.0 if no
#       modification should be done at all on a given field.
#
# =============================================================================
# Comments about typographical errors and misspellings found in the literature:
#
# Damerau 1964: - 80% are single errors: insert, delete, substitute or
#                                        transpose
#               - Statistic given: 567/964 (59%) substitutions
#                                  153/964 (16%) deletions
#                                   23/964 ( 2%) transpositions
#                                   99/964 (10%) insertions
#                                  122/964 (13%) multiple errors
#
# Hall 1980: - OCR and other automatic devices introduce similar errors of
#              substitutions, deletions and insertions, but not transpositions;
#              frequency and type of errors are characteristics of the device.
#
# Pollock/Zamora 1984: - OCR output contains almost exclusively substitution
#                        errors which ordinarily account for less than 20% of
#                        key boarded misspellings.
#                      - 90-95% of misspellings in raw keyboarding typically
#                        only contain one error.
#                      - Only 7.8% of the first letter of misspellings were
#                        incorrect, compared to 11.7% of the second and 19.2%
#                        of the third.
#                      - Generally assumed that vowels are less important than
#                        consonants.
#                      - The frequency of a misspelling seems to be determined
#                        more by the frequency of its parent word than by the
#                        difficulty of spelling it.
#                      - Most errors are mechanical (typos), not the result of
#                        poor spelling.
#                      - The more frequent a letter, the more likely it is to
#                        be miskeyed.
#                      - Deletions are similarly frequent to transpositions,
#                        but more frequent than insertions and again more
#                        frequent than substitutions.
#
# Pollock/Zamora 1983: - Study of 50,000 nonword errors, 3-4 character
#                        misspellings constitute only 9.2% of total
#                        misspellings, but they generate 40% of miscorrections.
#
# Peterson 1986: In two studies:
#                - Transpose two letters:  2.6% / 13.1%
#                - Insert letter:         18.7% / 20.3%
#                - Delete letter:         31.6% / 34.4%
#                - Substitute letter:     40.0% / 26.9%
#
# Kukich 1990: - Over 63% of errors in TDD conversations occur in words of
#                length 2, 3 or 4.
#
# Kukich 1992: - 13% of non-word spelling errors in a 40,000 corpus of typed
#                conversations involved merging of two words, 2% splitting a
#                word (often at valid forms, "forgot" -> "for got").
#              - Most misspellings seem to be within two characters in length
#                of the correct spelling.
#
# =============================================================================
# Other comments:
#
# - Intuitively, one can assume that long and infrequent words are more likely
#   to be misspelt than short and common words.
#
# =============================================================================

# Field configuration for the given-name attribute: values are drawn from a
# frequency file and corrupted via a misspellings look-up table.  See the key
# documentation at the top of this section for the meaning of each entry; the
# modification probabilities (everything except 'select_prob') sum to 1.0.
#
givenname_dict = {
    'name':          'given_name',
    'type':          'freq',
    'char_range':    'alpha',
    'freq_file':     'data' + os.sep + 'givenname-freq.csv',
    'select_prob':   0.10,
    'misspell_file': 'data' + os.sep + 'givenname-misspell.tbl',
    'misspell_prob': 0.30,
    'ins_prob':      0.05,
    'del_prob':      0.15,
    'sub_prob':      0.35,
    'trans_prob':    0.05,
    'val_swap_prob': 0.02,
    'wrd_swap_prob': 0.02,
    'spc_ins_prob':  0.01,
    'spc_del_prob':  0.01,
    'miss_prob':     0.02,
    'new_val_prob':  0.02,
}

# Field configuration for the surname attribute: frequency-file based, with a
# misspellings look-up table.  Modification probabilities (everything except
# 'select_prob') sum to 1.0.
#
surname_dict = {
    'name':          'surname',
    'type':          'freq',
    'char_range':    'alpha',
    'freq_file':     'data' + os.sep + 'surname-freq.csv',
    'select_prob':   0.15,
    'misspell_file': 'data' + os.sep + 'surname-misspell.tbl',
    'misspell_prob': 0.30,
    'ins_prob':      0.10,
    'del_prob':      0.10,
    'sub_prob':      0.35,
    'trans_prob':    0.04,
    'val_swap_prob': 0.02,
    'wrd_swap_prob': 0.02,
    'spc_ins_prob':  0.01,
    'spc_del_prob':  0.02,
    'miss_prob':     0.02,
    'new_val_prob':  0.02,
}

# Field configuration for the street-number attribute: frequency-file based,
# digits only, no misspellings table.  Modification probabilities (everything
# except 'select_prob') sum to 1.0.
#
streetnumber_dict = {
    'name':          'street_number',
    'type':          'freq',
    'char_range':    'digit',
    'freq_file':     'data' + os.sep + 'streetnumber-freq.csv',
    'select_prob':   0.10,
    'ins_prob':      0.10,
    'del_prob':      0.15,
    'sub_prob':      0.60,
    'trans_prob':    0.05,
    'val_swap_prob': 0.05,
    'wrd_swap_prob': 0.01,
    'spc_ins_prob':  0.00,
    'spc_del_prob':  0.00,
    'miss_prob':     0.02,
    'new_val_prob':  0.02,
}

# Field configuration for the first address line: frequency-file based, no
# misspellings table.  Modification probabilities (everything except
# 'select_prob') sum to 1.0.
#
address1_dict = {
    'name':          'address_1',
    'type':          'freq',
    'char_range':    'alpha',
    'freq_file':     'data' + os.sep + 'address1-freq.csv',
    'select_prob':   0.10,
    'ins_prob':      0.10,
    'del_prob':      0.15,
    'sub_prob':      0.55,
    'trans_prob':    0.05,
    'val_swap_prob': 0.02,
    'wrd_swap_prob': 0.03,
    'spc_ins_prob':  0.02,
    'spc_del_prob':  0.03,
    'miss_prob':     0.04,
    'new_val_prob':  0.01,
}

# Field configuration for the second address line.  Address 2 contains
# property and institution names and should only appear rarely, so its
# missing probability is set high (0.60).  Modification probabilities
# (everything except 'select_prob') sum to 1.0.
#
address2_dict = {
    'name':          'address_2',
    'type':          'freq',
    'char_range':    'alpha',
    'freq_file':     'data' + os.sep + 'address2-freq.csv',
    'select_prob':   0.10,
    'ins_prob':      0.04,
    'del_prob':      0.04,
    'sub_prob':      0.10,
    'trans_prob':    0.02,
    'val_swap_prob': 0.03,
    'wrd_swap_prob': 0.04,
    'spc_ins_prob':  0.02,
    'spc_del_prob':  0.01,
    'miss_prob':     0.60,
    'new_val_prob':  0.10,
}

suburb_dict = {'name':'suburb',
               'type':'freq',
         'char_range':'alpha',
          'freq_file':'data'+os.sep+'suburb-freq.csv',
        'select_prob':0.10,
      'misspell_file':'data'+os.sep+'suburb-misspell.tbl',
      'misspell_prob':0.40,
           'ins_prob':0.10,
           'del_prob':0.15,
           'sub_prob':0.22,
         'trans_prob':0.04,

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -