Import read_airr error
See original GitHub issue

Hi Gregor,
Thanks for developing and maintaining this cool tool!
I ran into an error while importing scTCR-seq data with the read_airr function (see below; for the data, I sent you a link via email). The cell (cell_id = AACCGAAAGCT) from the error message has three TRB and two TRA sequences assigned to it, but only one functional TRA and one functional TRB sequence that cover the CDR3 region (I attached the igblast output for it below: cell_id_AACCGAAAGCT.txt).
Can you reproduce the error? Am I missing something? Thank you in advance!
adata = ir.io.read_airr( ["/home/data/scirpy/OUT_igblast-1-17-1.tsv"] )
WARNING: Non-standard locus name ignored: None
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-5-78750baef17a> in <module>
----> 1 adata = ir.io.read_airr(
2 ["/home/data/scirpy/OUT_igblast-1-17-1.tsv"]
3 )
/opt/conda/lib/python3.8/site-packages/scirpy/io/_io.py in read_airr(path, use_umi_count_col, infer_locus, cell_attributes, include_fields)
437 tmp_cell.add_chain(chain_dict)
438
--> 439 return from_airr_cells(airr_cells.values(), include_fields=include_fields)
440
441
/opt/conda/lib/python3.8/site-packages/scirpy/io/_convert_anndata.py in from_airr_cells(airr_cells, include_fields)
74
75 """
---> 76 ir_df = pd.DataFrame.from_records(
77 (x.to_scirpy_record(include_fields=include_fields) for x in airr_cells)
78 )
/opt/conda/lib/python3.8/site-packages/pandas/core/frame.py in from_records(cls, data, index, exclude, columns, coerce_float, nrows)
1824
1825 if nrows is None:
-> 1826 values += data
1827 else:
1828 values.extend(itertools.islice(data, nrows - 1))
/opt/conda/lib/python3.8/site-packages/scirpy/io/_convert_anndata.py in <genexpr>(.0)
75 """
76 ir_df = pd.DataFrame.from_records(
---> 77 (x.to_scirpy_record(include_fields=include_fields) for x in airr_cells)
78 )
79 if ir_df.shape[0] > 0:
/opt/conda/lib/python3.8/site-packages/scirpy/io/_datastructures.py in to_scirpy_record(self, include_fields)
329 ), f"There can't be a secondary chain if there is no primary one: {res_dict}"
330 if _is_nan_or_missing("IR_VDJ_1_junction_aa"):
--> 331 assert _is_nan_or_missing(
332 "IR_VDJ_2_junction_aa"
333 ), f"There can't be a secondary chain if there is no primary one: {res_dict}"
AssertionError: There can't be a secondary chain if there is no primary one: {'multi_chain': True, 'extra_chains': '[{"PRFREQ": "0.830508475", "UMI_sequence": "CGTTACCC", "cdr1": "GACAGCA", "cdr1_aa": "DS", "cdr1_end": 59, "cdr1_start": 52, "cdr2": null, "cdr2_aa": null, "cdr2_end": null, "cdr2_start": null, "cdr3": null, "cdr3_aa": null, "cdr3_end": null, "cdr3_start": null, "consensus_count": 49, "d_alignment_end": null, "d_alignment_start": null, "d_call": null, "d_cigar": null, "d_germline_alignment": null, "d_germline_alignment_aa": null, "d_germline_end": null, "d_germline_start": null, "d_identity": null, "d_score": null, "d_sequence_alignment": null, "d_sequence_alignment_aa": null, "d_sequence_end": null, "d_sequence_start": null, "d_support": null, "duplicate_count": 1, "fwr1": "CTCTGACAGTCTGGGAAGGAGAGACCGCAATTCTGAACTGCAGTTATGAG", "fwr1_aa": "LTVWEGETAILNCSYE", "fwr1_end": 52, "fwr1_start": 2, "fwr2": null, "fwr2_aa": null, "fwr2_end": null, "fwr2_start": null, "fwr3": null, "fwr3_aa": null, "fwr3_end": null, "fwr3_start": null, "fwr4": null, "fwr4_aa": null, "fwr4_end": null, "fwr4_start": null, "germline_alignment": "CTCTGACAGTCTGGGAAGGAGAGACCGCAATTCTGAACTGCAGTTATGAGGACAGCANNNNNTAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA", "germline_alignment_aa": "LTVWEGETAILNCSYEDSXX*QNLLW*WDAAGGEA", "j_alignment_end": 109, "j_alignment_start": 62, "j_call": "TRAJ31*01,TRAJ31*02", "j_cigar": "64S10N47M", "j_germline_alignment": "TAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA", "j_germline_alignment_aa": "*QNLLW*WDAAGGEA", "j_germline_end": 57, "j_germline_start": 10, "j_identity": 100.0, "j_score": 91.054, "j_sequence_alignment": "TAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA", "j_sequence_alignment_aa": "*QNLLW*WDAAGGEA", "j_sequence_end": 111, "j_sequence_start": 64, "j_support": 1.69e-19, "junction": null, "junction_aa": null, "junction_aa_length": null, "junction_length": null, "locus": "TRA", "np1": "AGGGG", "np1_length": 5, "np2": null, "np2_length": 
null, "productive": false, "rev_comp": true, "sequence": "AGCTCTGACAGTCTGGGAAGGAGAGACCGCAATTCTGAACTGCAGTTATGAGGACAGCAAGGGGTAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA", "sequence_alignment": "CTCTGACAGTCTGGGAAGGAGAGACCGCAATTCTGAACTGCAGTTATGAGGACAGCAAGGGGTAACAGAATCTTCTTTGGTGATGGGACGCAGCTGGTGGTGAAGCCCA", "sequence_alignment_aa": "LTVWEGETAILNCSYEDSKG*QNLLW*WDAAGGEA", "sequence_id": "CGTTACCC|CELL_ID=AACCGAAAGCT|PRFREQ=0.8305084745762712|CONSCOUNT=49|DUPCOUNT=1", "stop_codon": true, "v_alignment_end": 57, "v_alignment_start": 0, "v_call": "TRAV14-1*01,TRAV14N-1*01", "v_cigar": "2S28N57M52S195N", "v_germline_alignment": "CTCTGACAGTCTGGGAAGGAGAGACCGCAATTCTGAACTGCAGTTATGAGGACAGCA", "v_germline_alignment_aa": "LTVWEGETAILNCSYEDS", "v_germline_end": 85, "v_germline_start": 28, "v_identity": 100.0, "v_score": 90.649, "v_sequence_alignment": "CTCTGACAGTCTGGGAAGGAGAGACCGCAATTCTGAACTGCAGTTATGAGGACAGCA", "v_sequence_alignment_aa": "LTVWEGETAILNCSYEDS", "v_sequence_end": 59, "v_sequence_start": 2, "v_support": 3.89e-18, "vj_in_frame": false}, {"PRFREQ": "0.969798658", "UMI_sequence": "CAGTGCTC", "cdr1": "TCAGGACATAGTGCT", "cdr1_aa": "SGHSA", "cdr1_end": 75, "cdr1_start": 60, "cdr2": null, "cdr2_aa": null, "cdr2_end": null, "cdr2_start": null, "cdr3": null, "cdr3_aa": null, "cdr3_end": null, "cdr3_start": null, "consensus_count": 578, "d_alignment_end": null, "d_alignment_start": null, "d_call": null, "d_cigar": null, "d_germline_alignment": null, "d_germline_alignment_aa": null, "d_germline_end": null, "d_germline_start": null, "d_identity": null, "d_score": null, "d_sequence_alignment": null, "d_sequence_alignment_aa": null, "d_sequence_end": null, "d_sequence_start": null, "d_support": null, "duplicate_count": 1, "fwr1": "AAGNGACAGGAAGCAACTCTGTGGTGTGAGCCAATT", "fwr1_aa": "KXQEATLWCEPI", "fwr1_end": 60, "fwr1_start": 24, "fwr2": "GTTTTCTGGTACAGACAGACCA", "fwr2_aa": "VFWYRQT", "fwr2_end": 97, "fwr2_start": 75, "fwr3": null, "fwr3_aa": null, "fwr3_end": null, "fwr3_start": 
null, "fwr4": null, "fwr4_aa": null, "fwr4_end": null, "fwr4_start": null, "germline_alignment": "AAGGGACAAGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCANNNNTAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG", "germline_alignment_aa": "KGQEATLWCEPISGHSAVFWYRQTXXNYAEQFFGPGTRLTVL", "j_alignment_end": 127, "j_alignment_start": 77, "j_call": "TRBJ2-1*01", "j_cigar": "101S50M", "j_germline_alignment": "TAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG", "j_germline_alignment_aa": "NYAEQFFGPGTRLTVL", "j_germline_end": 50, "j_germline_start": 0, "j_identity": 100.0, "j_score": 96.822, "j_sequence_alignment": "TAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG", "j_sequence_alignment_aa": "NYAEQFFGPGTRLTVL", "j_sequence_end": 151, "j_sequence_start": 101, "j_support": 4.34e-21, "junction": null, "junction_aa": null, "junction_aa_length": null, "junction_length": null, "locus": "TRB", "np1": "CATA", "np1_length": 4, "np2": null, "np2_length": null, "productive": true, "rev_comp": true, "sequence": "NCTCGTNGGCTCGGNGATGTGTATAAGNGACAGGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCACATATAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG", "sequence_alignment": "AAGNGACAGGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCACATATAACTATGCTGAGCAGTTCTTCGGACCAGGGACACGACTCACCGTCCTAG", "sequence_alignment_aa": "KXQEATLWCEPISGHSAVFWYRQTTYNYAEQFFGPGTRLTVL", "sequence_id": "CAGTGCTC|CELL_ID=AACCGAAAGCT|PRFREQ=0.9697986577181208|CONSCOUNT=578|DUPCOUNT=1", "stop_codon": false, "v_alignment_end": 73, "v_alignment_start": 0, "v_call": "TRBV16*01,TRBV16*02,TRBV16*04", "v_cigar": "24S42N73M54S175N", "v_germline_alignment": "AAGGGACAAGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCA", "v_germline_alignment_aa": "KGQEATLWCEPISGHSAVFWYRQT", "v_germline_end": 115, "v_germline_start": 42, "v_identity": 97.26, "v_score": 109.346, "v_sequence_alignment": "AAGNGACAGGAAGCAACTCTGTGGTGTGAGCCAATTTCAGGACATAGTGCTGTTTTCTGGTACAGACAGACCA", 
"v_sequence_alignment_aa": "KXQEATLWCEPISGHSAVFWYRQT", "v_sequence_end": 97, "v_sequence_start": 24, "v_support": 1.32e-23, "vj_in_frame": true}]', 'cell_id': 'AACCGAAAGCT', 'IR_VJ_1_consensus_count': 785, 'IR_VJ_2_consensus_count': None, 'IR_VDJ_1_consensus_count': 8396, 'IR_VDJ_2_consensus_count': 2927, 'IR_VJ_1_d_call': None, 'IR_VJ_2_d_call': None, 'IR_VDJ_1_d_call': None, 'IR_VDJ_2_d_call': 'TRBD2*01', 'IR_VJ_1_duplicate_count': 4, 'IR_VJ_2_duplicate_count': None, 'IR_VDJ_1_duplicate_count': 2, 'IR_VDJ_2_duplicate_count': 1, 'IR_VJ_1_j_call': 'TRAJ31*01,TRAJ31*02', 'IR_VJ_2_j_call': None, 'IR_VDJ_1_j_call': 'TRBJ2-1*01', 'IR_VDJ_2_j_call': 'TRBJ2-4*01', 'IR_VJ_1_junction': 'TGTGCAGCAAGGGGTAACAGAATCTTCTTT', 'IR_VJ_2_junction': None, 'IR_VDJ_1_junction': None, 'IR_VDJ_2_junction': 'TGTGCCAGCTCTCTCGGGGGGGAGAGTCAAAACACCTTGTACTTT', 'IR_VJ_1_junction_aa': 'CAARGNRIFF', 'IR_VJ_2_junction_aa': None, 'IR_VDJ_1_junction_aa': None, 'IR_VDJ_2_junction_aa': 'CASSLGGESQNTLYF', 'IR_VJ_1_locus': 'TRA', 'IR_VJ_2_locus': None, 'IR_VDJ_1_locus': 'TRB', 'IR_VDJ_2_locus': 'TRB', 'IR_VJ_1_productive': True, 'IR_VJ_2_productive': None, 'IR_VDJ_1_productive': True, 'IR_VDJ_2_productive': True, 'IR_VJ_1_v_call': 'TRAV14-1*01,TRAV14-2*01,TRAV14-2*03', 'IR_VJ_2_v_call': None, 'IR_VDJ_1_v_call': 'TRBV16*01,TRBV16*02,TRBV16*04', 'IR_VDJ_2_v_call': 'TRBV12-2*01', 'has_ir': True}
Package versions: scirpy 0.7.1 scanpy 1.8.1 pandas 1.2.0 anndata 0.7.6
Issue Analytics
- State:
- Created 2 years ago
- Comments:7
Thanks for highlighting this! I looked into it and it seems the old version didn't exclude non-productive chains. The problem is that `pd.read_csv` didn't convert booleans correctly, which is usually handled by the `airr` library. So the new version is correct!
Loading dataframes into `read_airr` is an unsupported feature (internally used for `io.from_dandelion`) and all datatypes need to be correct to use it.

Hi, I installed the development version and the import works! 👍 I noticed some slight differences in the numbers in the adata object, e.g.
adata.obs['chain_pairing'].value_counts() #old import
adata.obs['chain_pairing'].value_counts() #new import
Is this expected? Maybe there is a good reason for this, just wanted to let you know.