혈통 오류를 검색하고 혈통을 세대별로 정리하기

투정이 2009. 11. 30. 14:40

2009. 11. 30. 14:40

혈통 파일을 세대별로 정리하기

지난 번 프로그램에 animal과 sire가 동일한 것, animal과 dam이 동일 한 것, sire가 dma과 동일한 것과 같은

오류를 찾아내는 로직을 추가했다.

프로그램

PROGRAM pedi_ana

! program name : pedi_ana
! programmer : Park Byoungho
! usage : sort_ana
! purpose : read pedigree file, calculate the generation and write pedigree file with generation
! date : 2009.11.31.
! update : 2009.12.3 - If one animal is over 2 times, see if sire(dam)s are different

IMPLICIT NONE

INTEGER, PARAMETER :: I4 = SELECTED_INT_KIND(9) ! 4 byte 정수 정의

! store animla, sire, dam and etc info
TYPE list_node
CHARACTER(LEN = 32) :: key, sire, dam
INTEGER(KIND = I4) :: code ! 읽은 순서
INTEGER :: gen
INTEGER :: sex
INTEGER :: count
INTEGER :: dup_err
TYPE(list_node), POINTER :: next
END TYPE list_node

! to create array of pointer
TYPE list_head
TYPE(list_node), POINTER :: next
END TYPE list_head

! bucket size
INTEGER, PARAMETER :: HASHSIZE = 65536_I4

! define bucket of hash
TYPE(list_head), DIMENSION(HASHSIZE) :: bucket

!main

! data dictionary
INTEGER(KIND = I4) :: i ! for loop when reading pedigree data
INTEGER :: j ! for loop, maximum iteration for making generation
CHARACTER(LEN = 32) :: key, sire, dam ! for reading each data line
TYPE(list_node), POINTER :: node, ptr
INTEGER :: dup_err ! error check for (1)animal = sire, (2)animal = dam, (3)sire = dam
! (4) same animal has different sire(dam)
CHARACTER(LEN = 256) :: in_filename, out_filename
INTEGER :: status ! I/O status
CHARACTER(LEN = 256) :: error_msg ! error message

CALL init_bucket()

CALL get_command_argument(1, in_filename) ! get pedigree data filename from the command line
CALL get_command_argument(2, out_filename) ! get output data filename from the command line

! open data file and store data to hash
OPEN(UNIT = 20, FILE = in_filename, STATUS = 'OLD', ACTION = 'READ', IOSTAT = status, IOMSG = error_msg)

IF (status == 0) THEN ! file open success
i = 1
DO
READ(UNIT = 20, FMT = *, IOSTAT = status) key, sire, dam
IF (status /= 0) EXIT ! end of file

dup_err = 0 ! no error

IF (key == sire) THEN ! error for animal = sire
dup_err = 1
END IF

IF (key == dam) THEN ! error for animal = dam
dup_err = 2
END IF

IF (sire == dam .AND. sire /= '0') THEN ! error for sire = dam
dup_err = 3
END IF

CALL add_node(key, sire, dam, i, 1, 0, dup_err)
i = i + 1
END DO
ELSE ! data file open fail
WRITE (*,'(1X, A, A)') 'File open failed -- error message : ', error_msg
STOP
END IF

CLOSE(20)
! end of making hash

DO j = 1, 50 ! iteration
DO i = 1, HASHSIZE
IF(ASSOCIATED(bucket(i)%next)) THEN
ptr => bucket(i)%next
DO
IF (ASSOCIATED(ptr)) THEN
IF (ptr%sire /= '0') CALL add_gen(ptr%sire, ptr%gen, 2) ! process sire
IF (ptr%dam /= '0') call add_gen(ptr%dam, ptr%gen, 1) ! process dam
!WRITE(*,*) ptr%key
ptr => ptr%next
ELSE
EXIT
END IF
END DO
END IF
END DO ! end of i
END DO ! end of j

CALL write_bucket(out_filename)

CALL clear_bucket()

CONTAINS

FUNCTION hash(key)
INTEGER(KIND = I4) :: hash
CHARACTER(LEN = 32), INTENT(IN) :: key
INTEGER :: i, keylen, uv
INTEGER(KIND = I4) :: hashval

hashval = 5381
keylen = LEN_TRIM(key)
DO i = 1, keylen
!uv = MOD(hashval * 33, 65536_I4)
hashval = MOD(hashval * 33 + IACHAR(key(i:i)), 65536_I4) + 1
END DO
hash = hashval
END FUNCTION

SUBROUTINE init_bucket()
INTEGER :: i
DO i = 1, HASHSIZE
NULLIFY(bucket(i)%next)
END DO
END SUBROUTINE

SUBROUTINE lookup(key, node)
TYPE(list_node), POINTER, INTENT(OUT) :: node
CHARACTER(LEN = 32), INTENT(IN) :: key

node => bucket(hash(key))%next
DO
IF(.NOT. ASSOCIATED(node)) EXIT
IF(node%key == key) EXIT
node => node%next
END DO
END SUBROUTINE lookup

SUBROUTINE add_node(key, sire, dam, code, gen, sex, dup_err)
CHARACTER(LEN = 32), INTENT(IN) :: key, sire, dam
TYPE(list_node), POINTER :: node, tmp
INTEGER(KIND = I4) :: hv, err
INTEGER(KIND = I4), INTENT(IN) :: code
INTEGER, INTENT(IN) :: gen, sex, dup_err

hv = hash(key)

IF(.NOT. ASSOCIATED(bucket(hv)%next)) THEN ! if bucket(hv) is empty, i.e. no data at bucket(hv)
ALLOCATE(node, STAT = err)
IF(err /= 0) STOP 'Error : no memory'

NULLIFY(node%next)
node%key = key
node%sire = sire
node%dam = dam
node%code = code
node%gen = gen
node%sex = sex
node%count = 1
node%dup_err = dup_err
bucket(hv)%next => node
ELSE ! if bucket(hv) is not empty, i.e. some data at bucket(hv)

node => bucket(hv)%next
DO ! see if animal is in the hash
IF(node%key == key) THEN ! if there is already an animal in the hash

node%count = node%count + 1 ! increase the count

IF (node%sire /= sire .OR. node%dam /= dam) THEN ! if new sire(dam) and sire(dam) in the hash are different
node%dup_err = 4
END IF

EXIT
ELSE ! to reach the end of each bucket
IF(ASSOCIATED(node%next)) THEN
node => node%next
ELSE
ALLOCATE(tmp, STAT = err)
IF(err /= 0) STOP 'Error : no memory'
NULLIFY(tmp%next)

tmp%key = key
tmp%sire = sire
tmp%dam = dam
tmp%code = code
tmp%gen = gen
tmp%sex = sex
tmp%count = 1
tmp%dup_err = dup_err
node%next => tmp

EXIT
END IF
END IF
END DO
END IF
END SUBROUTINE

SUBROUTINE add_gen(parent, gen, sex)
CHARACTER(LEN = 32), INTENT(IN) :: parent ! sire or dam for generation addition
INTEGER, INTENT(IN) :: gen, sex ! generation of offsprint, sex of sire or dam
TYPE(list_node), POINTER :: node
CHARACTER(LEN = 32) :: sire = '0', dam = '0'

CALL lookup(parent, node)
IF(ASSOCIATED(node)) THEN ! if there is a sire in the hash
IF ( node%gen < gen + 1) node%gen = gen + 1
IF (node%sex == 0) THEN ! sex is not defined yet
node%sex = sex
ELSE
IF (node%sex /= sex) node%sex = 3 ! sex is strange
END IF
ELSE ! if there is not a sire in the hash
CALL add_node(parent, sire, dam, 0, 1, sex, 0)
END IF

END SUBROUTINE add_gen

SUBROUTINE write_bucket(filename)
CHARACTER(LEN = 40), INTENT(IN) :: filename
INTEGER :: i
TYPE(list_node), POINTER :: node

OPEN(UNIT = 12, FILE = filename, STATUS = 'REPLACE', ACTION = 'WRITE')

DO i = 1, HASHSIZE
IF(ASSOCIATED(bucket(i)%next)) THEN
node => bucket(i)%next
DO
IF (.NOT. ASSOCIATED(node)) EXIT
WRITE(12,*) node%key, node%sire, node%dam, node%code, node%gen, node%sex, node%count, node%dup_err
node => node%next
END DO
END IF
END DO ! end of i

CLOSE(12)
END SUBROUTINE write_bucket

SUBROUTINE del_list(node)
TYPE(list_node), POINTER :: node, tmp
DO
IF (ASSOCIATED(node)) THEN
!WRITE(*,*) node%key, node%code, node%gen, node%sex
tmp => node%next
DEALLOCATE(node)
node => tmp
ELSE
EXIT
END IF
END DO
END SUBROUTINE del_list

SUBROUTINE clear_bucket()
INTEGER :: i
DO i = 1, HASHSIZE
IF(ASSOCIATED(bucket(i)%next)) THEN
!WRITE(*,*) i
CALL del_list(bucket(i)%next)
NULLIFY(bucket(i)%next)
END IF
END DO
END SUBROUTINE clear_bucket

END PROGRAM pedi_ana

샘플 자료

5 1 2

50 0

6 3 4
7 5 6
8 7 9
9 10 12
16 13 14
17 14 15
18 16 17
29 18 19
20 19 21
21 22 23
22 24 0
23 24 0
25 0 29
28 26 27
33 31 32
34 33 0
32 34 35
36 36 0
37 0 37
38 36 36

위 자료를 exam_pedi.dat로 저장

결과 파일

10 0 0 0 3 2 1 0
12 0 0 0 3 1 1 0
13 0 0 0 5 2 1 0
14 0 0 0 5 3 1 0
15 0 0 0 5 1 1 0
16 13 14 7 4 2 1 0
17 14 15 8 4 1 1 0
18 16 17 9 3 2 1 0
19 0 0 0 3 3 1 0
20 19 21 11 1 0 1 0
21 22 23 12 2 1 1 0
22 24 0 13 3 2 1 0
23 24 0 14 3 1 1 0
24 0 0 0 4 2 1 0
25 0 29 15 1 0 1 0
26 0 0 0 2 2 1 0
27 0 0 0 2 1 1 0
28 26 27 16 1 0 1 0
29 18 19 10 2 1 1 0
31 0 0 0 76 2 1 0
32 34 35 19 76 1 1 0
33 31 32 17 76 2 1 0
34 33 0 18 75 2 1 0
35 0 0 0 75 1 1 0
36 36 0 20 51 3 1 1
37 0 37 21 51 1 1 2
38 36 36 22 1 0 1 3
1 0 0 0 4 2 1 0
2 0 0 0 4 1 1 0
3 0 0 0 4 2 1 0
4 0 0 0 4 1 1 0
5 1 2 1 3 2 2 4
6 3 4 3 3 1 1 0
7 5 6 4 2 2 1 0
8 7 9 5 1 0 1 0
9 10 12 6 2 1 1 0

column 순서

1 - animal

2 - sire

3 -dam

4 - 읽은 순서

5 - 세대(세대가 무지 높다면 혈통에 조상이 자식으로 나온다는 얘기, loop)

- 정확히 어느 개체가 문제인지 확인하려면 renumber를 해서 sire, dam이 animal 보다 큰 수가 나오는 것을 찾아야 함

- 여기서는 renumber 하는 것을 생략

6- sex(0 = unknown, 1 = 암, 2 = 수, 3 = error)

- 3이 나왔다는 것은 해당 animal이 sire 컬럼에도 dam 컬럼에도 나온다는 얘기

7 - 몇 번 등장하는 지 횟수

8 - error ( 0: normal, 1: animal = sire, 2 : animal = dam, 3 : sire = dam,

4 : animal existtwice, but has different sire(dam)

정렬(유닉스 sort 명령어를 이용할 것)

결과 파일을 exam_pedi.gen이라고 할 경우

sort -nrk 5 exam_pedi.gen -o exam_pedi.gen.cyc

sort -nrk 6 exam_pedi.gen -o exam_pedi.gen.sex

sort -nrk 7 exam_pedi.gen -o exam_pedi.gen.cnt

sort -nrk 8 exam_pedi.gen -o exam_pedi.gen.dup

Batch 작업

pedi_ana %1 %2
sort -nrk 5 %2 -o %2.cyc
sort -nrk 6 %2 -o %2.sex
sort -nrk7 %2 -o %2.cnt
sort -nrk 8 %2 -o %2.dup

위 내용을 b_pedi_ana.bat로 저장하면 혈통파일 검사하고, 정렬을 한 번에 함

실행은 b_pedi_ana.bat input_file output_file

프로그램 컴파일 및 실행화면

관련 파일

1259939700_pedi_ana.zip

다운로드

1259939700_pedi_ana.zip

'Animal Breeding > Fortran program' 카테고리의 다른 글

Fixed Regression Model - Fortran 프로그램 (0)	2010.01.27
혈통 파일 리넘버링 하기 (0)	2009.12.05
iod of multpel linear regression with class variable (0)	2009.11.03
IOD of multiple linear regression Fortran 95 프로그램 (0)	2009.11.01
IOD of simple linear regression Fortran 95 프로그램 (0)	2009.10.27

Let's Study Animal Breeding

혈통 오류를 검색하고 혈통을 세대별로 정리하기

'Animal Breeding > Fortran program' 카테고리의 다른 글

+ Recent posts

티스토리툴바